-
-
Notifications
You must be signed in to change notification settings - Fork 741
Fix Issue 18384 - std.net.isemail is slow to import due to regex #6129
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,7 +27,6 @@ module std.net.isemail; | |
|
||
// FIXME | ||
import std.range.primitives; // : ElementType; | ||
import std.regex; | ||
import std.traits; | ||
import std.typecons : Flag, Yes, No; | ||
|
||
|
@@ -74,10 +73,6 @@ if (isSomeChar!(Char)) | |
alias tstring = const(Char)[]; | ||
alias Token = TokenImpl!(Char); | ||
|
||
static ipRegex = ctRegex!(`\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}`~ | ||
`(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$`.to!(const(Char)[])); | ||
static fourChars = ctRegex!(`^[0-9A-Fa-f]{0,4}$`.to!(const(Char)[])); | ||
|
||
enum defaultThreshold = 16; | ||
int threshold; | ||
bool diagnose; | ||
|
@@ -399,12 +394,11 @@ if (isSomeChar!(Char)) | |
auto maxGroups = 8; | ||
size_t index = -1; | ||
auto addressLiteral = parseData[EmailPart.componentLiteral]; | ||
auto matchesIp = addressLiteral.matchAll(ipRegex).map!(a => a.hit).array; | ||
const(Char)[] ipSuffix = matchIPSuffix(addressLiteral); | ||
|
||
if (!matchesIp.empty) | ||
if (ipSuffix.length) | ||
{ | ||
index = addressLiteral.lastIndexOf(matchesIp.front); | ||
|
||
index = addressLiteral.length - ipSuffix.length; | ||
if (index != 0) | ||
addressLiteral = addressLiteral[0 .. index] ~ "0:0"; | ||
} | ||
|
@@ -418,7 +412,7 @@ if (isSomeChar!(Char)) | |
else | ||
{ | ||
auto ipV6 = addressLiteral[5 .. $]; | ||
matchesIp = ipV6.split(Token.colon); | ||
auto matchesIp = ipV6.split(Token.colon); | ||
immutable groupCount = matchesIp.length; | ||
index = ipV6.indexOf(Token.doubleColon); | ||
|
||
|
@@ -453,7 +447,7 @@ if (isSomeChar!(Char)) | |
returnStatus ~= EmailStatusCode.rfc5322IpV6ColonEnd; | ||
|
||
else if (!matchesIp | ||
.filter!(a => a.matchFirst(fourChars).empty) | ||
.filter!(a => !isUpToFourHexChars(a)) | ||
.empty) | ||
returnStatus ~= EmailStatusCode.rfc5322IpV6BadChar; | ||
|
||
|
@@ -1863,3 +1857,96 @@ const(T)[] get (T) (const(T)[] str, size_t index, dchar c) | |
assert("abc".get(1, 'b') == "b"); | ||
assert("löv".get(1, 'ö') == "ö"); | ||
} | ||
|
||
/+ | ||
Replacement for: | ||
--- | ||
static fourChars = ctRegex!(`^[0-9A-Fa-f]{0,4}$`.to!(const(Char)[])); | ||
... | ||
a => a.matchFirst(fourChars).empty | ||
--- | ||
+/ | ||
private bool isUpToFourHexChars(Char)(scope const(Char)[] s) | ||
{ | ||
import std.ascii : isHexDigit; | ||
if (s.length > 4) return false; | ||
foreach (c; s) | ||
if (!isHexDigit(c)) return false; | ||
return true; | ||
} | ||
|
||
version(StdUnittest) @nogc nothrow pure @safe unittest | ||
{ | ||
assert(!isUpToFourHexChars("12345")); | ||
assert(!isUpToFourHexChars("defg")); | ||
assert(isUpToFourHexChars("1A0a")); | ||
} | ||
|
||
/+ | ||
Replacement for: | ||
--- | ||
static ipRegex = ctRegex!(`\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}`~ | ||
`(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$`.to!(const(Char)[])); | ||
... | ||
auto matchesIp = addressLiteral.matchAll(ipRegex).map!(a => a.hit).array; | ||
---- | ||
Note that only the first item of "matchAll" was ever used in practice | ||
so we can return `const(Char)[]` instead of `const(Char)[][]` using a | ||
zero-length string to indicate no match. | ||
+/ | ||
private const(Char)[] matchIPSuffix(Char)(return const(Char)[] s) @nogc nothrow pure @safe | ||
{ | ||
size_t end = s.length; | ||
if (end < 7) return null; | ||
// Check the first three `[.]\d{1,3}` | ||
foreach (_; 0 .. 3) | ||
{ | ||
size_t start = void; | ||
if (end >= 2 && s[end-2] == '.') | ||
start = end - 2; | ||
else if (end >= 3 && s[end-3] == '.') | ||
start = end - 3; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This line is untested, try an IP address with 2 digits in an octet. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
else if (end >= 4 && s[end-4] == '.') | ||
start = end - 4; | ||
else | ||
return null; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about: start = end - s.byCodeUnit.retro.take(4).until(".").count; There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can do even one better: int blocks = s.byCodeUnit
.splitter(".")
.filter!checkValid // check an individual block
.take(4) // avoids problems with infinite strings
.walkLength;
if (blocks < 4) return null; There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't think that does the right thing. The blocks need to be consecutive and they need to be the tail of the string, and we need the total length of the tail. |
||
uint x = 0; | ||
foreach (i; start + 1 .. end) | ||
{ | ||
uint c = cast(uint) s[i] - '0'; | ||
if (c > 9) return null; | ||
x = x * 10 + c; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need to check for integer overflow There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't overflow, we're summing a maximum of 3 digits. |
||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. uint x = s[start + 1 .. end].byCodeUnit.take(2).to!uint.ifThrown(256); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI: Regarding There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While making things There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about converting to FWIW, I like the idea of tagging internal functions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It probably should be but I'll leave that for another pull request. It looks like |
||
if (x > 255) return null; | ||
end = start; | ||
} | ||
// Check the final `\d{1,3}`. | ||
if (end < 1) return null; | ||
size_t start = end - 1; | ||
uint x = cast(uint) s[start] - '0'; | ||
if (x > 9) return null; | ||
if (start > 0 && cast(uint) s[start-1] - '0' <= 9) | ||
{ | ||
--start; | ||
x += 10 * (cast(uint) s[start] - '0'); | ||
if (start > 0 && cast(uint) s[start-1] - '0' <= 9) | ||
{ | ||
--start; | ||
x += 100 * (cast(uint) s[start] - '0'); | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s[max(0, end - 3) .. $].byCodeUnit.to!uint.ifThrown(256); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't use this here because unlike above I don't know where the encoded octet starts. |
||
if (x > 255) return null; | ||
// Must either be at start of string or preceded by a non-word character. | ||
// (TO DETERMINE: is the definition of "word character" ASCII only?) | ||
if (start == 0) return s; | ||
const b = s[start - 1]; | ||
import std.ascii : isAlphaNum; | ||
if (isAlphaNum(b) || b == '_') return null; | ||
return s[start .. $]; | ||
} | ||
|
||
version(StdUnittest) @nogc nothrow pure @safe unittest | ||
{ | ||
assert(matchIPSuffix("255.255.255.255") == "255.255.255.255"); | ||
assert(matchIPSuffix("babaev 176.16.0.1") == "176.16.0.1"); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return s.length <= 4 && s.all!isHexDigit;
But that would needstd.algorithm.searching
:)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, this would involve auto-decoding I think.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
.byCodeUnit
- no autodecoding and in@nogc nothrow pure @safe
https://run.dlang.io/is/PvbTfz