From 37946e542ad4c8ee2891a269cdd7028b085acdd0 Mon Sep 17 00:00:00 2001 From: Vincent Maurin Date: Fri, 25 Jun 2021 11:06:03 +0200 Subject: [PATCH] Simplify the uri format validation regexp The previous regexp relied heavily on `|` alternation, which led to stack overflow errors for long URIs (as the Java Pattern class uses recursion heavily). I have reworked the regexp directly from the RFC, taking some shortcuts: it mostly validates which characters are allowed or not allowed in the different parts of the URI. refs https://github.com/networknt/json-schema-validator/issues/413 --- .../com/networknt/schema/JsonMetaSchema.java | 37 ++++++++++++++++++- .../resources/draft4/optional/format.json | 25 +++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/networknt/schema/JsonMetaSchema.java b/src/main/java/com/networknt/schema/JsonMetaSchema.java index 479155eab..ca90cb330 100644 --- a/src/main/java/com/networknt/schema/JsonMetaSchema.java +++ b/src/main/java/com/networknt/schema/JsonMetaSchema.java @@ -47,7 +47,42 @@ static PatternFormat pattern(String name, String regex) { "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$")); COMMON_BUILTIN_FORMATS.add(pattern("ipv6", 
"^\\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:)))(%.+)?\\s*$")); - COMMON_BUILTIN_FORMATS.add(pattern("uri", "^([A-Za-z][A-Za-z0-9+.-]+):(\\/\\/([^@]+@)?([A-Za-z0-9.\\-_~]+)(:\\d+)?)?((?:[A-Za-z0-9-._~]|%[A-Fa-f0-9]|[!$&'()*+,;=:@])+(?:\\/(?:[A-Za-z0-9-._~]|%[A-Fa-f0-9]|[!$&'()*+,;=:@])*)*|(?:\\/(?:[A-Za-z0-9-._~]|%[A-Fa-f0-9]|[!$&'()*+,;=:@])+)*)?(\\?(?:[A-Za-z0-9-._~]|%[A-Fa-f0-9]|[!$&'()*+,;=:@]|[/?])*)?(\\#(?:[A-Za-z0-9-._~]|%[A-Fa-f0-9]|[!$&'()*+,;=:@]|[/?])*)?$")); + + // From RFC 3986 + // ALPHA [A-Za-z] + // DIGIT [0-9] + // scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + // => [A-Za-z][A-Za-z0-9+.-]* + // unreserved = ALPHA / DIGIT / "-" / "." 
/ "_" / "~" + // => [A-Za-z0-9._~\-] + // gen-delims [:/?#\[\]@] + // sub-delims [!$&'()*+,;=] + // reserved = gen-delims / sub-delims + // => [:/?#\[\]@!$&'()*+,;=] + // pct-encoded = "%" HEXDIG HEXDIG + // => [A-Za-z0-9%] (approximation: the two hex digits after "%" are not enforced) + // pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + // => [A-Za-z0-9._~\-%!$&'()*+,;=:@] + // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) + // => [A-Za-z0-9._~\-%!$&'()*+,;=:]* + // host = IP-literal / IPv4address / reg-name + // => [A-Za-z0-9._~\-!$&'()*+,;=%:\[\]]* (approximation) + // port = *DIGIT + // => [0-9]* + // authority = [ userinfo "@" ] host [ ":" port ] + // => ([A-Za-z0-9._~\-%!$&'()*+,;=:]*@)?[A-Za-z0-9._~\-!$&'()*+,;=%:\[\]]*(:[0-9]*)? + // hier-part = "//" authority path-abempty + // / path-absolute + // / path-rootless + // / path-empty + // => (\/\/([A-Za-z0-9._~\-%!$&'()*+,;=:]*@)?[A-Za-z0-9._~\-!$&'()*+,;=%:\[\]]*(:[0-9]*)?)?[A-Za-z0-9._~\-%!$&'()*+,;=:@\/]* (approximation) + // query = *( pchar / "/" / "?" ) + // => [A-Za-z0-9._~\-%!$&'()*+,;=:@\/?]* + // fragment = *( pchar / "/" / "?" ) + // => [A-Za-z0-9._~\-%!$&'()*+,;=:@\/?]* + // uri = scheme ":" hier-part [ "?" 
query ] [ "#" fragment ] + COMMON_BUILTIN_FORMATS.add(pattern("uri", + "^[A-Za-z][A-Za-z0-9+.-]*:(\\/\\/([A-Za-z0-9._~\\-%!$&'()*+,;=:]*@)?[A-Za-z0-9._~\\-!$&'()*+,;=%:\\[\\]]*(:[0-9]*)?)?[A-Za-z0-9._~\\-%!$&'()*+,;=:@\\/]*([?][A-Za-z0-9._~\\-%!$&'()*+,;=:@\\/?]*)?([#][A-Za-z0-9._~\\-%!$&'()*+,;=:@\\/?]*)?")); COMMON_BUILTIN_FORMATS.add(pattern("color", "(#?([0-9A-Fa-f]{3,6})\\b)|(aqua)|(black)|(blue)|(fuchsia)|(gray)|(green)|(lime)|(maroon)|(navy)|(olive)|(orange)|(purple)|(red)|(silver)|(teal)|(white)|(yellow)|(rgb\\(\\s*\\b([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\\b\\s*,\\s*\\b([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\\b\\s*,\\s*\\b([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\\b\\s*\\))|(rgb\\(\\s*(\\d?\\d%|100%)+\\s*,\\s*(\\d?\\d%|100%)+\\s*,\\s*(\\d?\\d%|100%)+\\s*\\))")); COMMON_BUILTIN_FORMATS.add(pattern("hostname", diff --git a/src/test/resources/draft4/optional/format.json b/src/test/resources/draft4/optional/format.json index d22add103..4a1328d7d 100644 --- a/src/test/resources/draft4/optional/format.json +++ b/src/test/resources/draft4/optional/format.json @@ -211,6 +211,31 @@ "data": "http://json-schema.org/?#", "valid": true }, + { + "description": "One letter scheme", + "data": "a://foo", + "valid": true + }, + { + "description": "very long valid URI", + "data": 
"http://foo.bar/?baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a#&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a&baz=a%20a", + 
"valid": true + }, + { + "description": "authority with user", + "data": "https://foo@bar.org", + "valid": true + }, + { + "description": "authority with port", + "data": "https://foo.org:12345", + "valid": true + }, + { + "description": "authority with user and port", + "data": "https://foo@bar.org:12345", + "valid": true + }, { "description": "a scheme is mandatory in URI", "data": "//foo.bar/?baz=qux#quux",