Skip to content

Commit 5d1f1ac

Browse files
committed
[url-detector] Fixing various bugs
1 parent ae20422 commit 5d1f1ac

File tree

11 files changed

+137
-45
lines changed

11 files changed

+137
-45
lines changed

‎url-detector/src/main/java/com/linkedin/urls/detection/CharUtils.java‎

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ public static boolean isDot(char a) {
5858
return (a == '.' || a == '\u3002' || a == '\uFF0E' || a == '\uFF61');
5959
}
6060

61+
public static boolean isWhiteSpace(char a) {
62+
return (a == '\n' || a == '\t' || a == '\r' || a == ' ');
63+
}
64+
6165
/**
6266
* Splits a string without the use of a regex, which could split either by isDot() or %2e
6367
* @param input the input string that will be split by dot
@@ -83,6 +87,5 @@ public static String[] splitByDot(String input) {
8387
}
8488
splitList.add(section.toString());
8589
return splitList.toArray(new String[splitList.size()]);
86-
8790
}
8891
}

‎url-detector/src/main/java/com/linkedin/urls/detection/InputTextReader.java‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public InputTextReader(String content) {
5353
*/
5454
public char read() {
5555
char chr = _content[_index++];
56-
return (chr == '\n' || chr == '\t') ? ' ' : chr;
56+
return CharUtils.isWhiteSpace(chr) ? ' ' : chr;
5757
}
5858

5959
/**

‎url-detector/src/main/java/com/linkedin/urls/detection/UrlDetector.java‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,8 @@ private int processColon(int length) {
295295
length = _buffer.length(); //set length to be right after the scheme
296296
} else if (_buffer.length() > 0 && _options.hasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN)
297297
&& _reader.canReadChars(1)) { //takes care of case like hi:
298+
_reader.goBack(); //unread the ":" so readDomainName can take care of the port
299+
_buffer.delete(_buffer.length() - 1, _buffer.length());
298300
readDomainName(_buffer.toString());
299301
} else {
300302
readEnd(ReadEndState.InvalidUrl);

‎url-detector/src/main/java/com/linkedin/urls/url/HostNormalizer.java‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,9 @@ private void normalizeHost() {
8181
return;
8282
}
8383

84-
host = UrlUtil.removeExtraDots(host).replace("\\x", "%");
84+
host = UrlUtil.removeExtraDots(host);
8585

86-
_normalizedHost = UrlUtil.encode(host);
86+
_normalizedHost = UrlUtil.encode(host).replace("\\x", "%");
8787
}
8888

8989
/**

‎url-detector/src/main/java/com/linkedin/urls/url/NormalizedUrl.java‎

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,8 @@ public NormalizedUrl(UrlMarker urlMarker) {
2525
/**
2626
* Returns a normalized url given a single url.
2727
*/
28-
public static NormalizedUrl createNormalized(String url)
29-
throws MalformedURLException {
30-
return normalize(create(url));
28+
public static NormalizedUrl create(String url) throws MalformedURLException {
29+
return normalize(Url.create(url));
3130
}
3231

3332
/**

‎url-detector/src/main/java/com/linkedin/urls/url/PathNormalizer.java‎

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
*/
1010
package com.linkedin.urls.url;
1111

12+
import java.util.Stack;
1213
import org.apache.commons.lang3.StringUtils;
1314

1415
class PathNormalizer {
@@ -23,31 +24,50 @@ protected String normalizePath(String path) {
2324
if (StringUtils.isEmpty(path)) {
2425
return path;
2526
}
26-
path = UrlUtil.removeSpecialSpaces(path);
2727
path = UrlUtil.decode(path);
2828
path = sanitizeDotsAndSlashes(path);
2929
return UrlUtil.encode(path);
3030
}
3131

3232
/**
33-
* Replaces "/../" and "/./" with "/" recursively.
33+
* 1. Replaces "/./" with "/" recursively.
34+
* 2. "/blah/asdf/.." -> "/blah"
35+
* 3. "/blah/blah2/blah3/../../blah4" -> "/blah/blah4"
36+
* 4. "//" -> "/"
37+
* 5. Adds a slash at the end if there isn't one
3438
*/
3539
private static String sanitizeDotsAndSlashes(String path) {
3640
StringBuilder stringBuilder = new StringBuilder(path);
41+
Stack<Integer> slashIndexStack = new Stack<Integer>();
3742
int index = 0;
38-
while (index < stringBuilder.length() - 2) {
39-
if ( stringBuilder.charAt(index) == '/' && stringBuilder.charAt(index + 1) == '.') {
40-
if (index + 3 < stringBuilder.length() && stringBuilder.charAt(index + 2) == '.' &&
41-
stringBuilder.charAt(index + 3) == '/') {
42-
stringBuilder.delete(index, index + 3); // "/../" -> "/"
43-
index--; // backtrack so we can detect if this / is part of another replacement
44-
} else if (stringBuilder.charAt(index + 2) == '/') {
45-
stringBuilder.delete(index, index + 2); // "/./" -> "/"
46-
index--; // backtrack so we can detect if this / is part of another replacement
43+
while (index < stringBuilder.length() - 1) {
44+
if (stringBuilder.charAt(index) == '/') {
45+
slashIndexStack.add(index);
46+
if (stringBuilder.charAt(index + 1) == '.') {
47+
if (index < stringBuilder.length() - 2) {
48+
if (stringBuilder.charAt(index + 2) == '.') {
49+
slashIndexStack.pop();
50+
int endIndex = index + 3;
51+
// backtrack so we can detect if this / is part of another replacement
52+
index = slashIndexStack.empty() ? 0 : slashIndexStack.pop();
53+
stringBuilder.delete(index, endIndex); // "/asdf/../" -> ""
54+
} else if (stringBuilder.charAt(index + 2) == '/') {
55+
stringBuilder.delete(index, index + 2); // "/./" -> "/"
56+
index--; // backtrack so we can detect if this / is part of another replacement
57+
}
58+
}
59+
} else if (stringBuilder.charAt(index + 1) == '/') {
60+
stringBuilder.deleteCharAt(index);
61+
index--;
4762
}
4863
}
4964
index++;
5065
}
66+
67+
if (stringBuilder.length() == 0) {
68+
stringBuilder.append("/"); //Every path has at least a slash
69+
}
70+
5171
return stringBuilder.toString();
5272
}
5373
}

‎url-detector/src/main/java/com/linkedin/urls/url/Url.java‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ protected Url(UrlMarker urlMarker) {
6161
*/
6262
public static Url create(String url)
6363
throws MalformedURLException {
64-
List<Url> urls = new UrlDetector(url, UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN).detect();
64+
String formattedString = UrlUtil.removeSpecialSpaces(url.trim().replace(" ", "%20"));
65+
List<Url> urls = new UrlDetector(formattedString, UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN).detect();
6566
if (urls.size() == 1) {
6667
return urls.get(0);
6768
} else if (urls.size() == 0) {

‎url-detector/src/main/java/com/linkedin/urls/url/UrlUtil.java‎

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,10 @@
1111

1212
import com.linkedin.urls.detection.CharUtils;
1313
import com.linkedin.urls.detection.InputTextReader;
14-
import java.net.IDN;
1514
import java.util.Stack;
1615

1716
class UrlUtil {
1817

19-
private static final char TAB = 0x09;
20-
private static final char CR = 0x0d;
21-
private static final char LF = 0x0a;
22-
2318
/**
2419
* Decodes the url by iteratively removing hex characters with backtracking.
2520
* For example: %2525252525252525 becomes %
@@ -65,7 +60,7 @@ protected static String removeSpecialSpaces(String urlPart) {
6560
StringBuilder stringBuilder = new StringBuilder(urlPart);
6661
for (int i = 0; i < stringBuilder.length(); i++) {
6762
char curr = stringBuilder.charAt(i);
68-
if (curr == TAB || curr == CR || curr == LF) {
63+
if (CharUtils.isWhiteSpace(curr)) {
6964
stringBuilder.deleteCharAt(i);
7065
}
7166
}

‎url-detector/src/test/java/com/linkedin/urls/url/TestNormalizedUrl.java‎

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,62 @@ private Object[][] getHostPathUrls() {
3434
@Test(dataProvider = "getHostPathUrls")
3535
public void testUsernamePasswordUrls(String testString, String host, String path)
3636
throws MalformedURLException {
37-
Url url = NormalizedUrl.createNormalized(testString);
37+
Url url = NormalizedUrl.create(testString);
3838
Assert.assertNotNull(url);
3939
Assert.assertEquals(url.getHost(), host);
4040
Assert.assertEquals(url.getPath(), path);
4141
}
42+
43+
44+
/**
45+
* https://developers.google.com/safe-browsing/developers_guide_v3#Canonicalization
46+
* @return
47+
*/
48+
@DataProvider
49+
private Object[][] getFullUrls() {
50+
return new Object[][] {
51+
{"http://host/%25%32%35", "http://host/%25"},
52+
{"http://host/%25%32%35%25%32%35", "http://host/%25%25"},
53+
{"http://host/%2525252525252525", "http://host/%25"},
54+
{"http://host/asdf%25%32%35asd", "http://host/asdf%25asd"},
55+
{"http://host/%%%25%32%35asd%%", "http://host/%25%25%25asd%25%25"},
56+
{"http://www.google.com/", "http://www.google.com/"},
57+
{"http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/",
58+
"http://168.188.99.26/.secure/www.ebay.com/"},
59+
{"http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/",
60+
"http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/"},
61+
{"http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B",
62+
"http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+"},
63+
{"http://3279880203/blah", "http://195.127.0.11/blah"},
64+
{"http://www.google.com/blah/..", "http://www.google.com/"},
65+
{"www.google.com/", "http://www.google.com/"},
66+
{"www.google.com", "http://www.google.com/"},
67+
{"http://www.evil.com/blah#frag", "http://www.evil.com/blah"},
68+
{"http://www.GOOgle.com/", "http://www.google.com/"},
69+
{"http://www.google.com/foo\tbar\rbaz\n2", "http://www.google.com/foobarbaz2"},
70+
{"http://www.google.com/q?", "http://www.google.com/q?"},
71+
{"http://www.google.com/q?r?", "http://www.google.com/q?r?"},
72+
{"http://www.google.com/q?r?s", "http://www.google.com/q?r?s"},
73+
{"http://evil.com/foo#bar#baz", "http://evil.com/foo"},
74+
{"http://evil.com/foo;", "http://evil.com/foo;"},
75+
{"http://evil.com/foo?bar;", "http://evil.com/foo?bar;"},
76+
{"http://\\x01\\x80.com/", "http://%01%80.com/"},
77+
{"http://notrailingslash.com", "http://notrailingslash.com/"},
78+
{"http://www.gotaport.com:1234/", "http://www.gotaport.com:1234/"},
79+
{" http://www.google.com/ ", "http://www.google.com/"},
80+
{"http:// leadingspace.com/", "http://%20leadingspace.com/"},
81+
{"http://%20leadingspace.com/", "http://%20leadingspace.com/"},
82+
{"%20leadingspace.com/", "http://%20leadingspace.com/"},
83+
{"https://www.securesite.com/", "https://www.securesite.com/"},
84+
{"http://host.com/ab%23cd", "http://host.com/ab%23cd"},
85+
{"http://host.com//twoslashes?more//slashes", "http://host.com/twoslashes?more//slashes"},
86+
{"http://go.co/a/b/../c", "http://go.co/a/c"}
87+
};
88+
}
89+
90+
@Test(dataProvider = "getFullUrls")
91+
public void testFullUrls(String host, String expectedHost) throws MalformedURLException {
92+
NormalizedUrl normalizedUrl = NormalizedUrl.create(host);
93+
Assert.assertEquals(normalizedUrl.getFullUrlWithoutFragment(), expectedHost);
94+
}
4295
}

‎url-detector/src/test/java/com/linkedin/urls/url/TestPathNormalizer.java‎

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
/**
22
* Copyright 2015 LinkedIn Corp. All rights reserved.
3-
* Licensed under the Apache License, Version 2.0 (the "License");you may not use this file except in compliance
4-
* with the License.You may obtain a copy of the License at  http://www.apache.org/licenses/LICENSE-2.0
3+
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
4+
* with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
55
6-
* Unless required by applicable law or agreed to in writing, softwaredistributed under the License is distributed
7-
* on an "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6+
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed
7+
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
88
*
99
*/
1010
package com.linkedin.urls.url;
@@ -30,12 +30,19 @@ private Object[][] getPaths() {
3030
"/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/"},
3131
{"/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B",
3232
"/~a!b@c%23d$e%25f^00&11*22(33)44_55+"},
33-
{"/lala/.././../..../","/lala/..../"}
33+
{"/lala/.././../..../", "/..../"},
34+
{"//asdfasdf/awef/sadf/sdf//", "/asdfasdf/awef/sadf/sdf/"},
35+
{"/", "/"},
36+
{"/a/../b/c", "/b/c"},
37+
{"/blah/..", "/"},
38+
{"../", "../"},
39+
{"/asdf/.", "/asdf/."}
40+
3441
};
3542
}
3643

3744
@Test(dataProvider = "getPaths")
38-
public void testSanityAddresses(String path, String expectedPath) {
45+
public void testPaths(String path, String expectedPath) {
3946
PathNormalizer pathNormalizer = new PathNormalizer();
4047

4148
Assert.assertEquals(pathNormalizer.normalizePath(path), expectedPath);

0 commit comments

Comments
 (0)