Skip to content

Commit 01dd303

Browse files
authored
Optimize validate (#60)
Validation can now either fail fast (raise_ex=False) or fail with an explanation (raise_ex=True).
Improve validation performance by using fail-fast mode when checking unions.
Optimize record validation by performing an explicit check for "class" first.
Improve validation error reporting for unions: when one of the types in the union exactly matches the record's "class", errors for the other types in the union are suppressed.

Other changes:
* Use unicode strings more consistently.
* makedoc applies table styling.
* Add version constraint to lockfile package dependency.
1 parent 423e48b commit 01dd303

File tree

4 files changed

+165
-71
lines changed

4 files changed

+165
-71
lines changed

schema_salad/makedoc.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ def __init__(self): # type: () -> None
4949
def header(self, text, level, raw=None):
5050
return """<h%i id="%s">%s</h%i>""" % (level, to_id(text), text, level)
5151

52+
def table(self, header, body):
53+
return (
54+
'<table class="table table-striped">\n<thead>%s</thead>\n'
55+
'<tbody>\n%s</tbody>\n</table>\n'
56+
) % (header, body)
57+
5258

5359
def to_id(text): # type: (Union[str, unicode]) -> Union[str, unicode]
5460
textid = text

schema_salad/schema.py

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -233,35 +233,49 @@ def validate_doc(schema_names, doc, loader, strict):
233233
else:
234234
raise validate.ValidationException("Document must be dict or list")
235235

236+
roots = []
237+
for r in schema_names.names.values():
238+
if ((hasattr(r, "get_prop") and r.get_prop(u"documentRoot")) or (
239+
r.props.get(u"documentRoot"))):
240+
roots.append(r)
241+
236242
anyerrors = []
237243
for pos, item in enumerate(validate_doc):
238-
errors = []
239244
success = False
240-
for r in schema_names.names.values():
241-
if ((hasattr(r, "get_prop") and r.get_prop(u"documentRoot")) or (
242-
u"documentRoot" in r.props)):
245+
for r in roots:
246+
success = validate.validate_ex(
247+
r, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties, raise_ex=False)
248+
if success:
249+
break
250+
251+
if not success:
252+
errors = [] # type: List[unicode]
253+
for r in roots:
254+
if hasattr(r, "get_prop"):
255+
name = r.get_prop(u"name")
256+
elif hasattr(r, "name"):
257+
name = r.name
258+
243259
try:
244260
validate.validate_ex(
245-
r, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties)
246-
success = True
261+
r, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties, raise_ex=True)
262+
except validate.ClassValidationException as e:
263+
errors = [u"Could not validate `%s` because\n%s" % (
264+
name, validate.indent(str(e), nolead=False))]
247265
break
248266
except validate.ValidationException as e:
249-
if hasattr(r, "get_prop"):
250-
name = r.get_prop(u"name")
251-
elif hasattr(r, "name"):
252-
name = r.name
253-
errors.append("Could not validate as `%s` because\n%s" % (
267+
errors.append(u"Could not validate as `%s` because\n%s" % (
254268
name, validate.indent(str(e), nolead=False)))
255-
if not success:
256-
objerr = "Validation error at position %i" % pos
269+
270+
objerr = u"Validation error at position %i" % pos
257271
for ident in loader.identifiers:
258272
if ident in item:
259-
objerr = "Validation error in object %s" % (item[ident])
273+
objerr = u"Validation error in object %s" % (item[ident])
260274
break
261-
anyerrors.append("%s\n%s" %
262-
(objerr, validate.indent("\n".join(errors))))
275+
anyerrors.append(u"%s\n%s" %
276+
(objerr, validate.indent(u"\n".join(errors))))
263277
if anyerrors:
264-
raise validate.ValidationException("\n".join(anyerrors))
278+
raise validate.ValidationException(u"\n".join(anyerrors))
265279

266280

267281
def replace_type(items, spec, loader, found):

schema_salad/validate.py

Lines changed: 126 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -2,28 +2,28 @@
22
import avro.schema
33
import sys
44
import urlparse
5-
from typing import Any
5+
from typing import Any, Union
66

77
class ValidationException(Exception):
88
pass
99

10+
class ClassValidationException(ValidationException):
11+
pass
12+
1013
def validate(expected_schema, datum, identifiers=set(), strict=False, foreign_properties=set()):
1114
# type: (avro.schema.Schema, Any, Set[unicode], bool, Set[unicode]) -> bool
12-
try:
13-
return validate_ex(expected_schema, datum, identifiers, strict=strict, foreign_properties=foreign_properties)
14-
except ValidationException:
15-
return False
15+
return validate_ex(expected_schema, datum, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=False)
1616

1717
INT_MIN_VALUE = -(1 << 31)
1818
INT_MAX_VALUE = (1 << 31) - 1
1919
LONG_MIN_VALUE = -(1 << 63)
2020
LONG_MAX_VALUE = (1 << 63) - 1
2121

22-
def indent(v, nolead=False): # type: (str, bool) -> str
22+
def indent(v, nolead=False): # type: (Union[str, unicode], bool) -> unicode
2323
if nolead:
24-
return v.splitlines()[0] + "\n".join([" " + l for l in v.splitlines()[1:]])
24+
return v.splitlines()[0] + u"\n".join([u" " + l for l in v.splitlines()[1:]])
2525
else:
26-
return "\n".join([" " + l for l in v.splitlines()])
26+
return u"\n".join([" " + l for l in v.splitlines()])
2727

2828
def friendly(v): # type: (Any) -> Any
2929
if isinstance(v, avro.schema.NamedSchema):
@@ -37,11 +37,11 @@ def friendly(v): # type: (Any) -> Any
3737
else:
3838
return v
3939

40-
def multi(v, q=""): # type: (str, str) -> str
40+
def multi(v, q=""): # type: (Union[str, unicode], Union[str, unicode]) -> unicode
4141
if '\n' in v:
42-
return "%s%s%s\n" % (q, v, q)
42+
return u"%s%s%s\n" % (q, v, q)
4343
else:
44-
return "%s%s%s" % (q, v, q)
44+
return u"%s%s%s" % (q, v, q)
4545

4646
def vpformat(datum): # type: (Any) -> str
4747
a = pprint.pformat(datum)
@@ -50,8 +50,8 @@ def vpformat(datum): # type: (Any) -> str
5050
return a
5151

5252
def validate_ex(expected_schema, datum, identifiers=None, strict=False,
53-
foreign_properties=None):
54-
# type: (avro.schema.Schema, Any, Set[unicode], bool, Set[unicode]) -> bool
53+
foreign_properties=None, raise_ex=True):
54+
# type: (avro.schema.Schema, Any, Set[unicode], bool, Set[unicode], bool) -> bool
5555
"""Determine if a python datum is an instance of a schema."""
5656

5757
if not identifiers:
@@ -66,93 +66,154 @@ def validate_ex(expected_schema, datum, identifiers=None, strict=False,
6666
if datum is None:
6767
return True
6868
else:
69-
raise ValidationException("the value `%s` is not null" % vpformat(datum))
69+
if raise_ex:
70+
raise ValidationException(u"the value `%s` is not null" % vpformat(datum))
71+
else:
72+
return False
7073
elif schema_type == 'boolean':
7174
if isinstance(datum, bool):
7275
return True
7376
else:
74-
raise ValidationException("the value `%s` is not boolean" % vpformat(datum))
77+
if raise_ex:
78+
raise ValidationException(u"the value `%s` is not boolean" % vpformat(datum))
79+
else:
80+
return False
7581
elif schema_type == 'string':
7682
if isinstance(datum, basestring):
7783
return True
7884
elif isinstance(datum, bytes):
79-
datum = datum.decode("utf-8")
85+
datum = datum.decode(u"utf-8")
8086
return True
8187
else:
82-
raise ValidationException("the value `%s` is not string" % vpformat(datum))
88+
if raise_ex:
89+
raise ValidationException(u"the value `%s` is not string" % vpformat(datum))
90+
else:
91+
return False
8392
elif schema_type == 'bytes':
8493
if isinstance(datum, str):
8594
return True
8695
else:
87-
raise ValidationException("the value `%s` is not bytes" % vpformat(datum))
96+
if raise_ex:
97+
raise ValidationException(u"the value `%s` is not bytes" % vpformat(datum))
98+
else:
99+
return False
88100
elif schema_type == 'int':
89101
if ((isinstance(datum, int) or isinstance(datum, long))
90102
and INT_MIN_VALUE <= datum <= INT_MAX_VALUE):
91103
return True
92104
else:
93-
raise ValidationException("`%s` is not int" % vpformat(datum))
105+
if raise_ex:
106+
raise ValidationException(u"`%s` is not int" % vpformat(datum))
107+
else:
108+
return False
94109
elif schema_type == 'long':
95110
if ((isinstance(datum, int) or isinstance(datum, long))
96111
and LONG_MIN_VALUE <= datum <= LONG_MAX_VALUE):
97112
return True
98113
else:
99-
raise ValidationException("the value `%s` is not long" % vpformat(datum))
114+
if raise_ex:
115+
raise ValidationException(u"the value `%s` is not long" % vpformat(datum))
116+
else:
117+
return False
100118
elif schema_type in ['float', 'double']:
101119
if (isinstance(datum, int) or isinstance(datum, long)
102120
or isinstance(datum, float)):
103121
return True
104122
else:
105-
raise ValidationException("the value `%s` is not float or double" % vpformat(datum))
123+
if raise_ex:
124+
raise ValidationException(u"the value `%s` is not float or double" % vpformat(datum))
125+
else:
126+
return False
106127
elif isinstance(expected_schema, avro.schema.FixedSchema):
107128
if isinstance(datum, str) and len(datum) == expected_schema.size:
108129
return True
109130
else:
110-
raise ValidationException("the value `%s` is not fixed" % vpformat(datum))
131+
if raise_ex:
132+
raise ValidationException(u"the value `%s` is not fixed" % vpformat(datum))
133+
else:
134+
return False
111135
elif isinstance(expected_schema, avro.schema.EnumSchema):
112136
if expected_schema.name == "Any":
113137
if datum is not None:
114138
return True
115139
else:
116-
raise ValidationException("Any type must be non-null")
140+
if raise_ex:
141+
raise ValidationException(u"'Any' type must be non-null")
142+
else:
143+
return False
117144
if datum in expected_schema.symbols:
118145
return True
119146
else:
120-
raise ValidationException("the value `%s`\n is not a valid symbol in enum %s, expected one of %s" % (vpformat(datum), expected_schema.name, "'" + "', '".join(expected_schema.symbols) + "'"))
147+
if raise_ex:
148+
raise ValidationException(u"the value `%s`\n is not a valid symbol in enum %s, expected one of %s" % (vpformat(datum), expected_schema.name, "'" + "', '".join(expected_schema.symbols) + "'"))
149+
else:
150+
return False
121151
elif isinstance(expected_schema, avro.schema.ArraySchema):
122152
if isinstance(datum, list):
123153
for i, d in enumerate(datum):
124154
try:
125-
validate_ex(expected_schema.items, d, identifiers, strict=strict, foreign_properties=foreign_properties)
155+
if not validate_ex(expected_schema.items, d, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=raise_ex):
156+
return False
126157
except ValidationException as v:
127-
raise ValidationException("At position %i\n%s" % (i, indent(str(v))))
128-
return True
129-
else:
130-
raise ValidationException("the value `%s` is not a list, expected list of %s" % (vpformat(datum), friendly(expected_schema.items)))
131-
elif isinstance(expected_schema, avro.schema.MapSchema):
132-
if (isinstance(datum, dict) and
133-
False not in [isinstance(k, basestring) for k in datum.keys()] and
134-
False not in [validate(expected_schema.values, v, strict=strict) for v in datum.values()]):
158+
if raise_ex:
159+
raise ValidationException(u"At position %i\n%s" % (i, indent(str(v))))
160+
else:
161+
return False
135162
return True
136163
else:
137-
raise ValidationException("`%s` is not a valid map value, expected\n %s" % (vpformat(datum), vpformat(expected_schema.values)))
164+
if raise_ex:
165+
raise ValidationException(u"the value `%s` is not a list, expected list of %s" % (vpformat(datum), friendly(expected_schema.items)))
166+
else:
167+
return False
138168
elif isinstance(expected_schema, avro.schema.UnionSchema):
139-
if True in [validate(s, datum, identifiers, strict=strict) for s in expected_schema.schemas]:
140-
return True
141-
else:
142-
errors = []
143-
for s in expected_schema.schemas:
144-
try:
145-
validate_ex(s, datum, identifiers, strict=strict, foreign_properties=foreign_properties)
146-
except ValidationException as e:
147-
errors.append(str(e))
148-
raise ValidationException("the value %s is not a valid type in the union, expected one of:\n%s" % (multi(vpformat(datum), '`'), "\n".join(["- %s, but\n %s" % (friendly(expected_schema.schemas[i]), indent(multi(errors[i]))) for i in range(0, len(expected_schema.schemas))])))
169+
for s in expected_schema.schemas:
170+
if validate_ex(s, datum, identifiers, strict=strict, raise_ex=False):
171+
return True
172+
173+
if not raise_ex:
174+
return False
175+
176+
errors = [] # type: List[unicode]
177+
for s in expected_schema.schemas:
178+
try:
179+
validate_ex(s, datum, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=True)
180+
except ClassValidationException as e:
181+
raise
182+
except ValidationException as e:
183+
errors.append(unicode(e))
184+
185+
raise ValidationException(u"the value %s is not a valid type in the union, expected one of:\n%s" % (
186+
multi(vpformat(datum), '`'), u"\n".join([
187+
u"- %s, but\n %s" % (
188+
friendly(expected_schema.schemas[i]), indent(multi(errors[i])))
189+
for i in range(0, len(expected_schema.schemas))])))
149190

150191
elif isinstance(expected_schema, avro.schema.RecordSchema):
151192
if not isinstance(datum, dict):
152-
raise ValidationException("`%s`\n is not a dict" % vpformat(datum))
193+
if raise_ex:
194+
raise ValidationException(u"`%s`\n is not a dict" % vpformat(datum))
195+
else:
196+
return False
197+
198+
classmatch = None
199+
for f in expected_schema.fields:
200+
if f.name == "class":
201+
d = datum.get("class")
202+
if not d:
203+
if raise_ex:
204+
raise ValidationException(u"Missing 'class' field")
205+
else:
206+
return False
207+
if not validate_ex(f.type, d, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=raise_ex):
208+
return False
209+
classmatch = d
210+
break
153211

154212
errors = []
155213
for f in expected_schema.fields:
214+
if f.name == "class":
215+
continue
216+
156217
if f.name in datum:
157218
fieldval = datum[f.name]
158219
else:
@@ -162,12 +223,14 @@ def validate_ex(expected_schema, datum, identifiers=None, strict=False,
162223
fieldval = None
163224

164225
try:
165-
validate_ex(f.type, fieldval, identifiers, strict=strict, foreign_properties=foreign_properties)
226+
if not validate_ex(f.type, fieldval, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=raise_ex):
227+
return False
166228
except ValidationException as v:
167229
if f.name not in datum:
168-
errors.append("missing required field `%s`" % f.name)
230+
errors.append(u"missing required field `%s`" % f.name)
169231
else:
170-
errors.append("could not validate field `%s` because\n%s" % (f.name, multi(indent(str(v)))))
232+
errors.append(u"could not validate field `%s` because\n%s" % (f.name, multi(indent(str(v)))))
233+
171234
if strict:
172235
for d in datum:
173236
found = False
@@ -176,14 +239,25 @@ def validate_ex(expected_schema, datum, identifiers=None, strict=False,
176239
found = True
177240
if not found:
178241
if d not in identifiers and d not in foreign_properties and d[0] not in ("@", "$"):
242+
if not raise_ex:
243+
return False
179244
split = urlparse.urlsplit(d)
180245
if split.scheme:
181-
errors.append("could not validate extension field `%s` because it is not recognized and strict is True. Did you include a $schemas section?" % (d))
246+
errors.append(u"could not validate extension field `%s` because it is not recognized and strict is True. Did you include a $schemas section?" % (d))
182247
else:
183-
errors.append("could not validate field `%s` because it is not recognized and strict is True, valid fields are: %s" % (d, ", ".join(fn.name for fn in expected_schema.fields)))
248+
errors.append(u"could not validate field `%s` because it is not recognized and strict is True, valid fields are: %s" % (d, ", ".join(fn.name for fn in expected_schema.fields)))
184249

185250
if errors:
186-
raise ValidationException("\n".join(errors))
251+
if raise_ex:
252+
if classmatch:
253+
raise ClassValidationException(u"%s record %s" % (classmatch, "\n".join(errors)))
254+
else:
255+
raise ValidationException(u"\n".join(errors))
256+
else:
257+
return False
187258
else:
188259
return True
189-
raise ValidationException("Unrecognized schema_type %s" % schema_type)
260+
if raise_ex:
261+
raise ValidationException(u"Unrecognized schema_type %s" % schema_type)
262+
else:
263+
return False

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
'mistune',
3636
'typing >= 3.5.2',
3737
'CacheControl',
38-
'lockfile']
38+
'lockfile >= 0.9']
3939

4040
install_requires.append("avro") # TODO: remove me once cwltool is
4141
# available in Debian Stable, Ubuntu 12.04 LTS
@@ -46,7 +46,7 @@
4646
extras_require = {} # TODO: to be removed when the above is added
4747

4848
setup(name='schema-salad',
49-
version='1.17',
49+
version='1.18',
5050
description='Schema Annotations for Linked Avro Data (SALAD)',
5151
long_description=open(README).read(),
5252
author='Common workflow language working group',

0 commit comments

Comments
 (0)