forked from scrapy/url-chromium
-
Notifications
You must be signed in to change notification settings - Fork 0
/
url_canon_unittest.cc
2402 lines (2165 loc) · 112 KB
/
url_canon_unittest.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <errno.h>
#include <stddef.h>
#include "base/macros.h"
#include "base/strings/utf_string_conversions.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_canon_internal.h"
#include "url/url_canon_stdstring.h"
#include "url/url_test_utils.h"
namespace url {
namespace {
// One table row for a canonicalizer that takes a single 8-bit input (the
// Scheme test in this file is driven by an array of these).
struct ComponentCase {
// Raw text handed to the canonicalizer under test.
const char* input;
// Exact canonical output text the canonicalizer should produce.
const char* expected;
// Range (begin/len) the canonicalizer should report for its output.
Component expected_component;
// Return value the canonicalizer should produce.
bool expected_success;
};
// ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests
// treat each input as optional, and will only try processing if non-NULL.
// The output is always 8-bit.
struct DualComponentCase {
// Optional narrow input; NULL means "skip the 8-bit run" for this row.
const char* input8;
// Optional wide input; NULL means "skip the 16-bit run" for this row.
// NOTE(review): presumably converted with
// test_utils::TruncateWStringToUTF16() like the other wide inputs in this
// file -- the users of this struct are outside this chunk, please confirm.
const wchar_t* input16;
// Exact canonical output text, shared by both input widths.
const char* expected;
// Range (begin/len) the canonicalizer should report for its output.
Component expected_component;
// Return value the canonicalizer should produce.
bool expected_success;
};
// Test cases for CanonicalizeIPAddress(). The inputs are identical to
// DualComponentCase, but the output has extra CanonHostInfo fields.
// The Host test in this file also feeds a table of these to
// CanonicalizeHost() and CanonicalizeHostVerbose().
struct IPAddressCase {
// Optional narrow input; the Host test skips the 8-bit run when it is NULL.
const char* input8;
// Optional wide input; the Host test skips the 16-bit run when it is NULL
// and otherwise converts it with test_utils::TruncateWStringToUTF16().
const wchar_t* input16;
// Exact canonical output text (always 8-bit).
const char* expected;
// Range (begin/len) expected for the canonical host within the output.
Component expected_component;
// CanonHostInfo fields, for verbose output.
// The Host test additionally expects CanonicalizeHost() to return false
// exactly when this is CanonHostInfo::BROKEN.
CanonHostInfo::Family expected_family;
// Only compared by the Host test when expected_family is IPV4; the other
// rows visible in this file use -1.
int expected_num_ipv4_components;
const char* expected_address_hex; // Two hex chars per IP address byte.
};
// Renders the first |length| entries of |bytes| as text, two characters per
// byte (high nibble first) taken from kHexCharLookup.
//
// A |length| other than 0, 4 or 16 is reported as a non-fatal test failure;
// the conversion still runs afterwards.
std::string BytesToHexString(unsigned char bytes[16], int length) {
  EXPECT_TRUE(length == 0 || length == 4 || length == 16)
      << "Bad IP address length: " << length;

  std::string hex;
  int index = 0;
  while (index < length) {
    const unsigned char current = bytes[index];
    hex += kHexCharLookup[(current >> 4) & 0xf];  // High nibble.
    hex += kHexCharLookup[current & 0xf];         // Low nibble.
    ++index;
  }
  return hex;
}
// One table row for the component-replacement tests: a starting URL, one
// replacement string per URL component, and the expected result.
// NOTE(review): the tests that consume this struct are outside this chunk.
// Presumably each component string is handed to SetupReplComp(), in which
// case NULL leaves the component untouched and a string starting with
// kDeleteComp clears it -- please confirm against those tests.
struct ReplaceCase {
// URL the replacements are applied to.
const char* base;
// Per-component replacement strings.
const char* scheme;
const char* username;
const char* password;
const char* host;
const char* port;
const char* path;
const char* query;
const char* ref;
// Expected URL text after the replacements.
const char* expected;
};
// Magic string used in the replacements code that tells SetupReplComp to
// call the clear function. Note that SetupReplComp only compares the first
// character, so any replacement string that starts with '|' clears the
// component.
const char kDeleteComp[] = "|";
// Sets up a replacement for a single component. This is given pointers to
// the set and clear function for the component being replaced, and will
// either set the component (if it exists) or clear it (if the replacement
// string matches kDeleteComp).
//
//   set   - Replacements member that installs a new value for the component.
//   clear - Replacements member that removes the component.
//   rep   - Replacements object to modify.
//   str   - NUL-terminated replacement text. NULL leaves |rep| untouched; a
//           string whose first character is kDeleteComp[0] clears the
//           component; anything else is installed in full.
//
// The length of |str| is counted by hand rather than with strlen() so that
// the template also instantiates for wide character types, not just for the
// 8-bit case that strlen() supports.
template<typename CHAR>
void SetupReplComp(
    void (Replacements<CHAR>::*set)(const CHAR*, const Component&),
    void (Replacements<CHAR>::*clear)(),
    Replacements<CHAR>* rep,
    const CHAR* str) {
  if (!str)
    return;  // No replacement requested for this component.

  if (str[0] == kDeleteComp[0]) {
    (rep->*clear)();
    return;
  }

  // Character-width-independent equivalent of strlen(): count code units up
  // to (not including) the terminator.
  int length = 0;
  while (str[length])
    ++length;
  (rep->*set)(str, Component(0, length));
}
} // namespace
// Checks that AppendUTF8Value() writes the expected UTF-8 bytes for valid
// code points; the table has one sample for each encoded length from one to
// four bytes, plus the largest valid code point.
TEST(URLCanonTest, DoAppendUTF8) {
  struct UTF8Case {
    unsigned input;      // Code point to encode.
    const char* output;  // Expected UTF-8 byte sequence.
  };
  UTF8Case cases[] = {
      // Valid code points.
      {0x24, "\x24"},
      {0xA2, "\xC2\xA2"},
      {0x20AC, "\xE2\x82\xAC"},
      {0x24B62, "\xF0\xA4\xAD\xA2"},
      {0x10FFFF, "\xF4\x8F\xBF\xBF"},
  };

  std::string encoded;
  for (const UTF8Case& test_case : cases) {
    // The output object wraps |encoded|, so reset the string before each
    // case to keep the results independent.
    encoded.clear();
    StdStringCanonOutput sink(&encoded);
    AppendUTF8Value(test_case.input, &sink);
    sink.Complete();
    EXPECT_EQ(test_case.output, encoded);
  }
}
#if defined(GTEST_HAS_DEATH_TEST)
// Feeds AppendUTF8Value() a value one past the largest valid code point and
// expects nothing to be written to the output.
//
// TODO(mattm): Can't run this in debug mode for now, since the DCHECK will
// cause the Chromium stack trace dialog to appear and hang the test.
// See http://crbug.com/49580.
// The test is therefore disabled whenever DCHECKs are active (debug builds,
// or any build with DCHECK_ALWAYS_ON) and enabled otherwise.
#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON)
#define MAYBE_DoAppendUTF8Invalid DISABLED_DoAppendUTF8Invalid
#else
#define MAYBE_DoAppendUTF8Invalid DoAppendUTF8Invalid
#endif
TEST(URLCanonTest, MAYBE_DoAppendUTF8Invalid) {
  std::string written;
  StdStringCanonOutput sink(&written);
  // Invalid code point (too large).
  ASSERT_DEBUG_DEATH({
    AppendUTF8Value(0x110000, &sink);
    sink.Complete();
    EXPECT_EQ("", written);
  }, "");
}
#endif  // defined(GTEST_HAS_DEATH_TEST)
// Low-level test that we handle reading, canonicalization, and writing
// UTF-8/UTF-16 strings properly.
//
// Every case is pushed through AppendUTF8EscapedChar() one character at a
// time, once for each input width that is provided. Both the accumulated
// success flag and the escaped output are checked, and for cases that are
// expected to succeed with both widths present, the two inputs are also
// checked to be conversions of each other.
TEST(URLCanonTest, UTF) {
  struct UTFCase {
    const char* input8;      // Optional; nullptr skips the 8-bit run.
    const wchar_t* input16;  // Optional; nullptr skips the 16-bit run.
    bool expected_success;
    const char* output;      // Expected escaped output for either width.
  } utf_cases[] = {
      // Valid canonical input should get passed through & escaped.
      {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
      // Test a character that takes > 16 bits (U+10300 = old italic letter A)
      {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
      // Non-shortest-form UTF-8 characters are invalid. The bad bytes should
      // each be replaced with the replacement character (EF BF BD in UTF-8).
      {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", nullptr, false,
       "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%E5%A5%BD"},
      // Invalid UTF-8 sequences should be marked as invalid (the first
      // sequence is truncated).
      {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"},
      // Character going off the end.
      {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"},
      // ...same with low surrogates with no high surrogate.
      {nullptr, L"\xdc00", false, "%EF%BF%BD"},
      // Test a UTF-8 encoded surrogate value is marked as invalid.
      // ED A0 80 = U+D800
      {"\xed\xa0\x80", nullptr, false, "%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
      // ...even when paired.
      {"\xed\xa0\x80\xed\xb0\x80", nullptr, false,
       "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
  };

  std::string out_str;
  for (size_t i = 0; i < arraysize(utf_cases); i++) {
    const UTFCase& utf_case = utf_cases[i];

    if (utf_case.input8) {
      out_str.clear();
      StdStringCanonOutput output(&out_str);

      int input_len = static_cast<int>(strlen(utf_case.input8));
      bool success = true;
      // The index is passed by pointer so the callee can move it within the
      // input. NOTE(review): presumably it is left on the last unit consumed
      // for multi-byte sequences -- confirm against AppendUTF8EscapedChar().
      for (int ch = 0; ch < input_len; ch++) {
        success &= AppendUTF8EscapedChar(utf_case.input8, &ch, input_len,
                                         &output);
      }
      output.Complete();
      EXPECT_EQ(utf_case.expected_success, success)
          << "for 8-bit input of case " << i;
      EXPECT_EQ(std::string(utf_case.output), out_str)
          << "for 8-bit input of case " << i;
    }

    if (utf_case.input16) {
      out_str.clear();
      StdStringCanonOutput output(&out_str);

      base::string16 input_str(
          test_utils::TruncateWStringToUTF16(utf_case.input16));
      int input_len = static_cast<int>(input_str.length());
      bool success = true;
      for (int ch = 0; ch < input_len; ch++) {
        success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len,
                                         &output);
      }
      output.Complete();
      EXPECT_EQ(utf_case.expected_success, success)
          << "for 16-bit input of case " << i;
      EXPECT_EQ(std::string(utf_case.output), out_str)
          << "for 16-bit input of case " << i;
    }

    if (utf_case.input8 && utf_case.input16 && utf_case.expected_success) {
      // Check that the UTF-8 and UTF-16 inputs are equivalent.

      // UTF-16 -> UTF-8
      std::string input8_str(utf_case.input8);
      base::string16 input16_str(
          test_utils::TruncateWStringToUTF16(utf_case.input16));
      EXPECT_EQ(input8_str, base::UTF16ToUTF8(input16_str))
          << "for case " << i;

      // UTF-8 -> UTF-16
      EXPECT_EQ(input16_str, base::UTF8ToUTF16(input8_str))
          << "for case " << i;
    }
  }
}
// Runs CanonicalizeScheme() over a table of schemes, once with the 8-bit
// input and once with the same text converted to UTF-16, and then checks
// the handling of a scheme that is declared nonexistent.
TEST(URLCanonTest, Scheme) {
  // Here, we're mostly testing that unusual characters are handled properly.
  // The canonicalizer doesn't do any parsing or whitespace detection. It will
  // also do its best on error, and will escape funny sequences (these won't be
  // valid schemes and it will return error).
  //
  // Note that the canonicalizer will append a colon to the output to separate
  // out the rest of the URL, which is not present in the input. We check,
  // however, that the output range includes everything but the colon.
  ComponentCase scheme_cases[] = {
      {"http", "http:", Component(0, 4), true},
      {"HTTP", "http:", Component(0, 4), true},
      {" HTTP ", "%20http%20:", Component(0, 10), false},
      {"htt: ", "htt%3A%20:", Component(0, 9), false},
      {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:",
       Component(0, 22), false},
      // Don't re-escape something already escaped. Note that it will
      // "canonicalize" the 'A' to 'a', but that's OK.
      {"ht%3Atp", "ht%3atp:", Component(0, 7), false},
      {"", ":", Component(0, 0), false},
  };

  std::string canonical;
  for (const ComponentCase& scheme_case : scheme_cases) {
    const Component& wanted_range = scheme_case.expected_component;

    // Narrow (8-bit) input first.
    Component in_range(0, static_cast<int>(strlen(scheme_case.input)));
    Component out_range;
    canonical.clear();
    StdStringCanonOutput narrow_output(&canonical);
    bool ok = CanonicalizeScheme(scheme_case.input, in_range, &narrow_output,
                                 &out_range);
    narrow_output.Complete();

    EXPECT_EQ(scheme_case.expected_success, ok);
    EXPECT_EQ(std::string(scheme_case.expected), canonical);
    EXPECT_EQ(wanted_range.begin, out_range.begin);
    EXPECT_EQ(wanted_range.len, out_range.len);

    // Now try the wide version. The input range is re-measured because the
    // UTF-16 text can have a different length than the UTF-8 text.
    canonical.clear();
    StdStringCanonOutput wide_output(&canonical);
    base::string16 wide_text(base::UTF8ToUTF16(scheme_case.input));
    in_range.len = static_cast<int>(wide_text.length());
    ok = CanonicalizeScheme(wide_text.c_str(), in_range, &wide_output,
                            &out_range);
    wide_output.Complete();

    EXPECT_EQ(scheme_case.expected_success, ok);
    EXPECT_EQ(std::string(scheme_case.expected), canonical);
    EXPECT_EQ(wanted_range.begin, out_range.begin);
    EXPECT_EQ(wanted_range.len, out_range.len);
  }

  // Test the case where the scheme is declared nonexistent, it should be
  // converted into an empty scheme.
  Component absent_range;
  canonical.clear();
  StdStringCanonOutput absent_output(&canonical);
  EXPECT_FALSE(CanonicalizeScheme("", Component(0, -1), &absent_output,
                                  &absent_range));
  absent_output.Complete();

  EXPECT_EQ(std::string(":"), canonical);
  EXPECT_EQ(0, absent_range.begin);
  EXPECT_EQ(0, absent_range.len);
}
// Runs every row of host_cases through CanonicalizeHost() and then through
// CanonicalizeHostVerbose(), each time with the narrow input and with the
// wide input (whichever of the two is non-NULL for that row).
//
// The non-verbose pass checks the return value, the output text and the
// output range; a row is expected to fail exactly when its expected family
// is CanonHostInfo::BROKEN. The verbose pass checks the reported family,
// output text, output range, the address bytes (as hex) and, for IPv4 rows
// only, the number of IPv4 components.
TEST(URLCanonTest, Host) {
IPAddressCase host_cases[] = {
// Basic canonicalization, uppercase should be converted to lowercase.
{"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
// Spaces and some other characters should be escaped.
{"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", Component(0, 22), CanonHostInfo::NEUTRAL, -1, ""},
// Exciting different types of spaces!
{NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
// Other types of space (no-break, zero-width, zero-width-no-break) are
// name-prepped away to nothing.
{NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
// Ideographic full stop (full-width period for Chinese, etc.) should be
// treated as a dot.
{NULL, L"www.foo\x3002" L"bar.com", "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
// Invalid unicode characters should fail...
// ...In wide input, ICU will barf and we'll end up with the input as
// escaped UTF-8 (the invalid character should be replaced with the
// replacement character).
{"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
// ...This is the same as the previous case but with the input escaped.
{"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
// Test name prepping, fullwidth input should be converted to ASCII and NOT
// IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
{"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
// Test that fullwidth escaped values are properly name-prepped,
// then converted or rejected.
// ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
{"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
{"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
// ...%00 in fullwidth should fail (also as escaped UTF-8 input)
{"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
{"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
// ICU will convert weird percents into ASCII percents, but not unescape
// further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a
// "small percent". At this point we should be within our rights to mark
// anything as invalid since the URL is corrupt or malicious. The code
// happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped
// and kept as valid, so we validate that behavior here, but this level
// of fixing the input shouldn't be seen as required. "%81" is invalid.
{"\xef\xb9\xaa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
{"%ef%b9%aa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
{"\xef\xb9\xaa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
{"%ef%b9%aa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
// Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
{"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
// See http://unicode.org/cldr/utility/idna.jsp for other
// examples/experiments and http://goo.gl/7yG11o
// for the full list of characters handled differently by
// IDNA 2003, UTS 46 (http://unicode.org/reports/tr46/ ) and IDNA 2008.
// 4 Deviation characters are mapped/ignored in UTS 46 transitional
// mechanism. UTS 46, table 4 row (g).
// Sharp-s is mapped to 'ss' in UTS 46 and IDNA 2003.
// Otherwise, it'd be "xn--fuball-cta.de".
{"fu\xc3\x9f" "ball.de", L"fu\x00df" L"ball.de", "fussball.de",
Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
// Final-sigma (U+03C2) is mapped to regular sigma (U+03C3).
// Otherwise, it'd be "xn--wxaijb9b".
{"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82", L"\x3c3\x3cc\x3bb\x3bf\x3c2",
"xn--wxaikc6b", Component(0, 12),
CanonHostInfo::NEUTRAL, -1, ""},
// ZWNJ (U+200C) and ZWJ (U+200D) are mapped away in UTS 46 transitional
// handling as well as in IDNA 2003.
{"a\xe2\x80\x8c" "b\xe2\x80\x8d" "c", L"a\x200c" L"b\x200d" L"c", "abc",
Component(0, 3), CanonHostInfo::NEUTRAL, -1, ""},
// ZWJ between Devanagari characters is still mapped away in UTS 46
// transitional handling. IDNA 2008 would give xn--11bo0mv54g.
{"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c",
L"\x915\x94d\x200d\x91c", "xn--11bo0m",
Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
// Fullwidth exclamation mark is disallowed. UTS 46, table 4, row (b)
// However, we do allow this at the moment because we don't use
// STD3 rules and canonicalize full-width ASCII to ASCII.
{"wow\xef\xbc\x81", L"wow\xff01", "wow%21",
Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
// U+2132 (turned capital F) is disallowed. UTS 46, table 4, row (c)
// Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
{"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo",
Component(0, 11), CanonHostInfo::BROKEN, -1, ""},
// U+2F868 (CJK Comp) is disallowed. UTS 46, table 4, row (d)
// Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
{"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn",
"%F0%AF%A1%A8%E5%A7%BB.cn",
Component(0, 24), CanonHostInfo::BROKEN, -1, ""},
// Maps uppercase letters to lower case letters. UTS 46 table 4 row (e)
{"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya",
Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
// An already-IDNA host is not modified.
{"xn--mnchen-3ya", L"xn--mnchen-3ya", "xn--mnchen-3ya",
Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
// Symbol/punctuations are allowed in IDNA 2003/UTS46.
// Not allowed in IDNA 2008. UTS 46 table 4 row (f).
{"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us",
Component(0, 13), CanonHostInfo::NEUTRAL, -1, ""},
// U+11013 is new in Unicode 6.0 and is allowed. UTS 46 table 4, row (h)
// We used to allow it because we passed through unassigned code points.
{"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com",
Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
// U+0602 is disallowed in UTS46/IDNA 2008. UTS 46 table 4, row(i)
// Used to be allowed in IDNA 2003.
{"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg",
Component(0, 9), CanonHostInfo::BROKEN, -1, ""},
// U+20B7 is new in Unicode 5.2 (not a part of IDNA 2003 based
// on Unicode 3.2). We did allow it in the past because we let unassigned
// code point pass. We continue to allow it even though it's a
// "punctuation and symbol" blocked in IDNA 2008.
// UTS 46 table 4, row (j)
{"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com",
Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
// Maps uppercase letters to lower case letters.
// In IDNA 2003, it's allowed without case-folding
// ( xn--bc-7cb.com ) because it's not defined in Unicode 3.2
// (added in Unicode 4.1). UTS 46 table 4 row (k)
{"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com",
Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
// Maps U+FF43 (Full Width Small Letter C) to 'c'.
{"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz",
Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
// Maps U+1D68C (Math Monospace Small C) to 'c'.
// U+1D68C = \xD835\xDE8C in UTF-16
{"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz",
Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
// BiDi check test
// "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM.
// Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008.
{"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8",
L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw",
Component(0, 13), CanonHostInfo::NEUTRAL, -1, ""},
// Disallowed in both IDNA 2003 and 2008 with BiDi check.
// Labels starting with a RTL character cannot end with a LTR character.
{"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz",
"%D8%AC%D8%A7%D8%B1xyz", Component(0, 21),
CanonHostInfo::BROKEN, -1, ""},
// Labels starting with a RTL character can end with BC=EN (European
// number). Disallowed in IDNA 2003 but now allowed.
{"\xd8\xac\xd8\xa7\xd8\xb1" "2", L"\x62c\x627\x631" L"2",
"xn--2-ymcov", Component(0, 11),
CanonHostInfo::NEUTRAL, -1, ""},
// Labels starting with a RTL character cannot have "L" characters
// even if it ends with an BC=EN. Disallowed in both IDNA 2003/2008.
{"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2",
"%D8%AC%D8%A7%D8%B1xy2", Component(0, 21),
CanonHostInfo::BROKEN, -1, ""},
// Labels starting with a RTL character can end with BC=AN (Arabic number)
// Disallowed in IDNA 2003, but now allowed.
{"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662",
"xn--mgbjq0r", Component(0, 11),
CanonHostInfo::NEUTRAL, -1, ""},
// Labels starting with a RTL character cannot have "L" characters
// even if it ends with an BC=AN (Arabic number).
// Disallowed in both IDNA 2003/2008.
{"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662",
"%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26),
CanonHostInfo::BROKEN, -1, ""},
// Labels starting with a RTL character cannot mix BC=EN and BC=AN
{"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662",
"%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27),
CanonHostInfo::BROKEN, -1, ""},
// As of Unicode 6.2, U+20CF is not assigned. We do not allow it.
{"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com",
Component(0, 13), CanonHostInfo::BROKEN, -1, ""},
// U+0080 is not allowed.
{"\xc2\x80.com", L"\x80.com", "%C2%80.com",
Component(0, 10), CanonHostInfo::BROKEN, -1, ""},
// Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
// UTF-8 (wide case). The output should be equivalent to the true wide
// character input above).
{"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd",
L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba",
Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
// Invalid escaped characters should fail and the percents should be
// escaped.
{"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10),
CanonHostInfo::BROKEN, -1, ""},
// If we get an invalid character that has been escaped.
{"%25", L"%25", "%25", Component(0, 3),
CanonHostInfo::BROKEN, -1, ""},
{"hello%00", L"hello%00", "hello%00", Component(0, 8),
CanonHostInfo::BROKEN, -1, ""},
// Escaped numbers should be treated like IP addresses if they are.
{"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01",
"192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3,
"C0A80001"},
{"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e",
"192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3,
"C0A80001"},
// Invalid escaping should trigger the regular host error handling.
{"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", Component(0, 17), CanonHostInfo::BROKEN, -1, ""},
// Something that isn't exactly an IP should get treated as a host and
// spaces escaped.
{"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
// Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
// These are "0Xc0.0250.01" in fullwidth.
{"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
// Broken IP addresses get marked as such.
{"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13), CanonHostInfo::BROKEN, -1, ""},
{"[google.com]", L"[google.com]", "[google.com]", Component(0, 12), CanonHostInfo::BROKEN, -1, ""},
// Cyrillic letter followed by '(' should return punycode for '(' escaped
// before punycode string was created. I.e.
// if '(' is escaped after punycode is created we would get xn--%28-8tb
// (incorrect).
{"\xd1\x82(", L"\x0442(", "xn--%28-7ed", Component(0, 11),
CanonHostInfo::NEUTRAL, -1, ""},
// Address with all hexadecimal characters with leading number of 1<<32
// or greater and should return NEUTRAL rather than BROKEN if not all
// components are numbers.
{"12345678912345.de", L"12345678912345.de", "12345678912345.de", Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""},
{"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de", Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
{"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de", "12345678912345.12345678912345.de", Component(0, 32), CanonHostInfo::NEUTRAL, -1, ""},
{"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de", Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""},
{"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde", Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
// A label that starts with "xn--" but contains non-ASCII characters should
// be an error. Escape the invalid characters.
{"xn--m\xc3\xbcnchen", L"xn--m\xfcnchen", "xn--m%C3%BCnchen", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
};
// CanonicalizeHost() non-verbose.
std::string out_str;
for (size_t i = 0; i < arraysize(host_cases); i++) {
// Narrow version.
if (host_cases[i].input8) {
int host_len = static_cast<int>(strlen(host_cases[i].input8));
Component in_comp(0, host_len);
Component out_comp;
out_str.clear();
StdStringCanonOutput output(&out_str);
bool success = CanonicalizeHost(host_cases[i].input8, in_comp, &output,
&out_comp);
output.Complete();
// Success is expected exactly when the row is not marked BROKEN.
EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
success) << "for input: " << host_cases[i].input8;
EXPECT_EQ(std::string(host_cases[i].expected), out_str) <<
"for input: " << host_cases[i].input8;
EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin) <<
"for input: " << host_cases[i].input8;
EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len) <<
"for input: " << host_cases[i].input8;
}
// Wide version.
if (host_cases[i].input16) {
// The wchar_t literal is converted to a string16 before use, and the
// input length is measured on the converted string.
base::string16 input16(
test_utils::TruncateWStringToUTF16(host_cases[i].input16));
int host_len = static_cast<int>(input16.length());
Component in_comp(0, host_len);
Component out_comp;
out_str.clear();
StdStringCanonOutput output(&out_str);
bool success = CanonicalizeHost(input16.c_str(), in_comp, &output,
&out_comp);
output.Complete();
// Same expectations as the narrow run; note these carry no
// "for input" message.
EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
success);
EXPECT_EQ(std::string(host_cases[i].expected), out_str);
EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin);
EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len);
}
}
// CanonicalizeHostVerbose()
for (size_t i = 0; i < arraysize(host_cases); i++) {
// Narrow version.
if (host_cases[i].input8) {
int host_len = static_cast<int>(strlen(host_cases[i].input8));
Component in_comp(0, host_len);
out_str.clear();
StdStringCanonOutput output(&out_str);
CanonHostInfo host_info;
CanonicalizeHostVerbose(host_cases[i].input8, in_comp, &output,
&host_info);
output.Complete();
EXPECT_EQ(host_cases[i].expected_family, host_info.family);
EXPECT_EQ(std::string(host_cases[i].expected), out_str);
EXPECT_EQ(host_cases[i].expected_component.begin,
host_info.out_host.begin);
EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
// The address bytes are compared as hex text; rows that are not IP
// addresses expect an empty string here.
EXPECT_EQ(std::string(host_cases[i].expected_address_hex),
BytesToHexString(host_info.address, host_info.AddressLength()));
// The component count is only checked for rows expected to be IPv4.
if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
host_info.num_ipv4_components);
}
}
// Wide version.
if (host_cases[i].input16) {
base::string16 input16(
test_utils::TruncateWStringToUTF16(host_cases[i].input16));
int host_len = static_cast<int>(input16.length());
Component in_comp(0, host_len);
out_str.clear();
StdStringCanonOutput output(&out_str);
CanonHostInfo host_info;
CanonicalizeHostVerbose(input16.c_str(), in_comp, &output, &host_info);
output.Complete();
EXPECT_EQ(host_cases[i].expected_family, host_info.family);
EXPECT_EQ(std::string(host_cases[i].expected), out_str);
EXPECT_EQ(host_cases[i].expected_component.begin,
host_info.out_host.begin);
EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
EXPECT_EQ(std::string(host_cases[i].expected_address_hex),
BytesToHexString(host_info.address, host_info.AddressLength()));
if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
host_info.num_ipv4_components);
}
}
}
}
// Exercises CanonicalizeIPAddress() on IPv4-style hosts through both the 8-bit
// and the 16-bit entry points.
//
// Each table row lists, in order: the 8-bit input, the 16-bit input, the
// expected canonical text, the expected output component, the expected
// address family, the expected number of IPv4 components, and the expected
// address bytes as upper-case hex. The canonical text, output component and
// component count are only compared when the result is classified as IPV4;
// the family and the address bytes are compared for every row.
TEST(URLCanonTest, IPv4) {
  IPAddressCase cases[] = {
    // Empty is not an IP address.
    {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {".", L".", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Regular IP addresses in different bases.
    {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    // Non-IP addresses due to invalid characters.
    {"192.168.9.com", L"192.168.9.com", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Invalid characters for the base should be rejected.
    {"19a.168.0.1", L"19a.168.0.1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0308.0250.00.01", L"0308.0250.00.01", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // If there are not enough components, the last one should fill them out.
    {"192", L"192", "0.0.0.192", Component(0, 9), CanonHostInfo::IPV4, 1, "000000C0"},
    {"0xC0a80001", L"0xC0a80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"030052000001", L"030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"000030052000001", L"000030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"192.168", L"192.168", "192.0.0.168", Component(0, 11), CanonHostInfo::IPV4, 2, "C00000A8"},
    // NOTE(review): the 8-bit and 16-bit spellings below differ by one leading
    // zero in the hex component; both denote the same numeric value.
    {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
    {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
    {"192.168.1", L"192.168.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
    // Too many components means not an IP address.
    {"192.168.0.0.1", L"192.168.0.0.1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // We allow a single trailing dot.
    {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"192.168.0.1. hello", L"192.168.0.1. hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"192.168.0.1..", L"192.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Two dots in a row means not an IP address.
    {"192.168..1", L"192.168..1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Any numerical overflow should be marked as BROKEN.
    {"0x100.0", L"0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100.0.0", L"0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100.0.0.0", L"0x100.0.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0x100.0.0", L"0.0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0x100.0", L"0.0.0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0.0x100", L"0.0.0.0x100", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0x10000", L"0.0.0x10000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0x1000000", L"0.0x1000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100000000", L"0x100000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Repeat the previous tests, minus 1, to verify boundaries.
    {"0xFF.0", L"0xFF.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 2, "FF000000"},
    {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 3, "FF000000"},
    {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "FF000000"},
    {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "00FF0000"},
    {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", Component(0, 9), CanonHostInfo::IPV4, 4, "0000FF00"},
    {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", Component(0, 9), CanonHostInfo::IPV4, 4, "000000FF"},
    {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"},
    {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"},
    {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"},
    // Old truncation tests. They're all "BROKEN" now.
    {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.015052000001", L"192.015052000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0X12C0a80001", L"0X12C0a80001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"276.1.2", L"276.1.2", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Spaces should be rejected.
    {"192.168.0.1 hello", L"192.168.0.1 hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Very large numbers.
    {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0FF0001"},
    {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // A number has no length limit, but long numbers can still overflow.
    {"00000000000000000001", L"00000000000000000001", "0.0.0.1", Component(0, 7), CanonHostInfo::IPV4, 1, "00000001"},
    {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // If a long component is non-numeric, it's a hostname, *not* a broken IP.
    {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Truncation of all zeros should still result in 0.
    {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", Component(0, 7), CanonHostInfo::IPV4, 4, "00000000"},
  };

  for (size_t i = 0; i < arraysize(cases); i++) {
    // Tag every failure below with the table row that produced it.
    SCOPED_TRACE(::testing::Message()
                 << "case " << i << ", input \"" << cases[i].input8 << "\"");

    // 8-bit version.
    {
      SCOPED_TRACE("8-bit input");
      Component component(0, static_cast<int>(strlen(cases[i].input8)));

      std::string out_str;
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;
      CanonicalizeIPAddress(cases[i].input8, component, &output, &host_info);
      output.Complete();

      EXPECT_EQ(cases[i].expected_family, host_info.family);
      EXPECT_EQ(std::string(cases[i].expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      if (host_info.family == CanonHostInfo::IPV4) {
        EXPECT_STREQ(cases[i].expected, out_str.c_str());
        EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
        EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
        EXPECT_EQ(cases[i].expected_num_ipv4_components,
                  host_info.num_ipv4_components);
      }
    }

    // 16-bit version. It gets its own CanonHostInfo so that nothing left over
    // from the 8-bit run above can satisfy these expectations.
    {
      SCOPED_TRACE("16-bit input");
      base::string16 input16(
          test_utils::TruncateWStringToUTF16(cases[i].input16));
      Component component(0, static_cast<int>(input16.length()));

      std::string out_str;
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;
      CanonicalizeIPAddress(input16.c_str(), component, &output, &host_info);
      output.Complete();

      EXPECT_EQ(cases[i].expected_family, host_info.family);
      EXPECT_EQ(std::string(cases[i].expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      if (host_info.family == CanonHostInfo::IPV4) {
        EXPECT_STREQ(cases[i].expected, out_str.c_str());
        EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
        EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
        EXPECT_EQ(cases[i].expected_num_ipv4_components,
                  host_info.num_ipv4_components);
      }
    }
  }
}
// Exercises CanonicalizeIPAddress() on IPv6-style hosts through both the 8-bit
// and the 16-bit entry points.
//
// Each table row lists, in order: the 8-bit input, the 16-bit input, the
// expected canonical text, the expected output component, the expected
// address family, the (unused here, always -1) IPv4 component count, and the
// expected address bytes as upper-case hex. The canonical text and output
// component are only compared when the result is classified as IPV6; the
// family and the address bytes are compared for every row.
TEST(URLCanonTest, IPv6) {
  IPAddressCase cases[] = {
    // Empty is not an IP address.
    {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Non-IPs with [:] characters are marked BROKEN.
    {":", L":", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[", L"[", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[:", L"[:", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"]", L"]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {":]", L":]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[]", L"[]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[:]", L"[:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Regular IP address is invalid without bounding '[' and ']'.
    {"2001:db8::1", L"2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[2001:db8::1", L"[2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"2001:db8::1]", L"2001:db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Regular IP addresses.
    {"[::]", L"[::]", "[::]", Component(0, 4), CanonHostInfo::IPV6, -1, "00000000000000000000000000000000"},
    {"[::1]", L"[::1]", "[::1]", Component(0, 5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000001"},
    {"[1::]", L"[1::]", "[1::]", Component(0, 5), CanonHostInfo::IPV6, -1, "00010000000000000000000000000000"},
    // Leading zeros should be stripped.
    {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", Component(0, 17), CanonHostInfo::IPV6, -1, "00000001000200030004000500060007"},
    // Upper case letters should be lowercased.
    {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", Component(0, 20), CanonHostInfo::IPV6, -1, "000A000B000C00DE00FF0000000100AC"},
    // The same address can be written with different contractions, but should
    // get canonicalized to the same thing.
    {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0, 14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
    {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", Component(0, 14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
    // Addresses with embedded IPv4.
    {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", Component(0, 10), CanonHostInfo::IPV6, -1, "000000000000000000000000C0A80001"},
    {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"},
    {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "[::eeee:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000EEEEC0A80001"},
    {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "[2001::c0a8:1]", Component(0, 14), CanonHostInfo::IPV6, -1, "200100000000000000000000C0A80001"},
    {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // IPv4 with last component missing.
    {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0010002"},
    // IPv4 using hex.
    // TODO(eroman): Should this format be disallowed?
    {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"},
    // There may be zeros surrounding the "::" contraction.
    {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", Component(0, 5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000008"},
    {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0, 13), CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"},
    // Can only have one "::" contraction in an IPv6 string literal.
    {"[2001::db8::1]", L"[2001::db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // No more than 2 consecutive ':'s.
    {"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Non-IP addresses due to invalid characters.
    {"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // If there are not enough components, the last one should fill them out.
    // ... omitted at this time ...
    // Too many components means not an IP address. Similarly, with too few
    // if using IPv4 compat or mapped addresses.
    {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Too many bits (even though 8 components, the last one holds 32 bits).
    {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Too many bits specified -- the contraction would have to be zero-length
    // to not exceed 128 bits.
    {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // The contraction is for 16 bits of zero.
    {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", Component(0, 17), CanonHostInfo::IPV6, -1, "00010002000300040005000600000008"},
    // Cannot have a trailing colon.
    {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Cannot have negative numbers.
    {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Scope ID -- a ["%" <scope_id>] suffix is not allowed. Every input that
    // contains a '%', with or without an ID after it, is expected to be
    // BROKEN.
    {"[1::%1]", L"[1::%1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[1::%eth0]", L"[1::%eth0]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[1::%]", L"[1::%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[%]", L"[%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[::%:]", L"[::%:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Don't allow leading or trailing colons.
    {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // We allow a single trailing dot.
    // ... omitted at this time ...
    // Two dots in a row means not an IP address.
    {"[::192.168..1]", L"[::192.168..1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Any non-first components get truncated to one byte.
    // ... omitted at this time ...
    // Spaces should be rejected.
    {"[::1 hello]", L"[::1 hello]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
  };

  for (size_t i = 0; i < arraysize(cases); i++) {
    // Tag every failure below with the table row that produced it.
    SCOPED_TRACE(::testing::Message()
                 << "iter " << i << " host " << cases[i].input8);

    // 8-bit version.
    {
      SCOPED_TRACE("8-bit input");
      Component component(0, static_cast<int>(strlen(cases[i].input8)));

      std::string out_str;
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;
      CanonicalizeIPAddress(cases[i].input8, component, &output, &host_info);
      output.Complete();

      EXPECT_EQ(cases[i].expected_family, host_info.family);
      EXPECT_EQ(std::string(cases[i].expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      if (host_info.family == CanonHostInfo::IPV6) {
        EXPECT_STREQ(cases[i].expected, out_str.c_str());
        EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
        EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
      }
    }

    // 16-bit version. It gets its own CanonHostInfo so that nothing left over
    // from the 8-bit run above can satisfy these expectations.
    {
      SCOPED_TRACE("16-bit input");
      base::string16 input16(
          test_utils::TruncateWStringToUTF16(cases[i].input16));
      Component component(0, static_cast<int>(input16.length()));

      std::string out_str;
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;
      CanonicalizeIPAddress(input16.c_str(), component, &output, &host_info);
      output.Complete();

      EXPECT_EQ(cases[i].expected_family, host_info.family);
      EXPECT_EQ(std::string(cases[i].expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      if (host_info.family == CanonHostInfo::IPV6) {
        EXPECT_STREQ(cases[i].expected, out_str.c_str());
        EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
        EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
      }
    }
  }
}
// Checks that CanonicalizeIPAddress() does not report an IP address when the
// host component it is given is invalid or zero-length, even though the
// underlying spec text spells out a valid IPv4 address.
TEST(URLCanonTest, IPEmpty) {
  const char spec[] = "192.168.0.1";

  std::string canonical;
  StdStringCanonOutput sink(&canonical);
  CanonHostInfo info;

  // Default-constructed component.
  CanonicalizeIPAddress(spec, Component(), &sink, &info);
  EXPECT_FALSE(info.IsIPAddress());

  // Component that starts at offset 0 but has length 0.
  CanonicalizeIPAddress(spec, Component(0, 0), &sink, &info);
  EXPECT_FALSE(info.IsIPAddress());
}
// Checks the output of CanonicalizeHostSubstring() and confirms that it leaves
// IP-address-like input as written instead of "fixing" it. This code is a
// subset of CanonicalizeHost, so the functionality the two share is not
// tested here.
TEST(URLCanonTest, CanonicalizeHostSubstring) {
  // Basic sanity check: a UTF-8 host comes out as lower-case punycode.
  {
    const char host[] = "M\xc3\x9cNCHEN.com";
    std::string canonical;
    StdStringCanonOutput sink(&canonical);
    const bool ok = CanonicalizeHostSubstring(host, Component(0, 12), &sink);
    sink.Complete();
    EXPECT_TRUE(ok);
    EXPECT_EQ("xn--mnchen-3ya.com", canonical);
  }

  // Failure case: the call reports failure and the first character is
  // written out as the escape sequence %EF%BF%BD.
  {
    const base::string16 host =
        test_utils::TruncateWStringToUTF16(L"\xfdd0zyx.com");
    std::string canonical;
    StdStringCanonOutput sink(&canonical);
    const bool ok =
        CanonicalizeHostSubstring(host.c_str(), Component(0, 8), &sink);
    sink.Complete();
    EXPECT_FALSE(ok);
    EXPECT_EQ("%EF%BF%BDzyx.com", canonical);
  }

  // Empty input strings should succeed and produce no output.
  {
    std::string canonical;
    StdStringCanonOutput sink(&canonical);
    const bool ok = CanonicalizeHostSubstring("", Component(0, 0), &sink);
    sink.Complete();
    EXPECT_TRUE(ok);
    EXPECT_EQ(std::string(), canonical);
  }

  // Numbers that look like IP addresses should pass through unchanged.
  {
    const char host[] = "01.02.03.04";
    std::string canonical;
    StdStringCanonOutput sink(&canonical);
    const bool ok = CanonicalizeHostSubstring(host, Component(0, 11), &sink);
    sink.Complete();
    EXPECT_TRUE(ok);
    EXPECT_EQ("01.02.03.04", canonical);
  }
}
TEST(URLCanonTest, UserInfo) {
// Note that the canonicalizer should escape and treat empty components as
// not being there.
// We actually parse a full input URL so we can get the initial components.
struct UserComponentCase {
const char* input;
const char* expected;
Component expected_username;
Component expected_password;
bool expected_success;
} user_info_cases[] = {
{"http://user:[email protected]/", "user:pass@", Component(0, 4), Component(5, 4), true},
{"http://@host.com/", "", Component(0, -1), Component(0, -1), true},
{"http://:@host.com/", "", Component(0, -1), Component(0, -1), true},
{"http://foo:@host.com/", "foo@", Component(0, 3), Component(0, -1), true},
{"http://:[email protected]/", ":foo@", Component(0, 0), Component(1, 3), true},
{"http://^ :$\[email protected]/", "%5E%20:$%09@", Component(0, 6), Component(7, 4), true},
{"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true},
{"http://%2540:[email protected]/", "%2540:bar@", Component(0, 5), Component(6, 3), true },
// IE7 compatibility: old versions allowed backslashes in usernames, but
// IE7 does not. We disallow it as well.
{"ftp://me\\mydomain:[email protected]/", "", Component(0, -1), Component(0, -1), true},
};
for (size_t i = 0; i < arraysize(user_info_cases); i++) {
int url_len = static_cast<int>(strlen(user_info_cases[i].input));
Parsed parsed;
ParseStandardURL(user_info_cases[i].input, url_len, &parsed);
Component out_user, out_pass;
std::string out_str;
StdStringCanonOutput output1(&out_str);
bool success = CanonicalizeUserInfo(user_info_cases[i].input,
parsed.username,
user_info_cases[i].input,
parsed.password,
&output1,
&out_user,
&out_pass);
output1.Complete();
EXPECT_EQ(user_info_cases[i].expected_success, success);
EXPECT_EQ(std::string(user_info_cases[i].expected), out_str);
EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin);
EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len);
EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin);
EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len);
// Now try the wide version
out_str.clear();
StdStringCanonOutput output2(&out_str);