Skip to content

Commit def0364

Browse files
authored
Merge pull request #678 from yarikoptic/no-hypothesis
RF+BF: Add tolerances and data types for nib-diff, remove hypothesis dependency
2 parents 7877add + 3636c4d commit def0364

File tree

7 files changed

+298
-99
lines changed

7 files changed

+298
-99
lines changed

.travis.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ cache:
1313
- $HOME/.cache/pip
1414
env:
1515
global:
16-
- DEPENDS="six numpy scipy matplotlib h5py pillow pydicom hypothesis"
16+
- DEPENDS="six numpy scipy matplotlib h5py pillow pydicom"
1717
- OPTIONAL_DEPENDS=""
1818
- INSTALL_TYPE="setup"
1919
- EXTRA_WHEELS="https://5cf40426d9f06eb7461d-6fe47d9331aba7cd62fc36c7196769e4.ssl.cf2.rackcdn.com"
@@ -97,7 +97,7 @@ before_install:
9797
- source venv/bin/activate
9898
- python --version # just to check
9999
- pip install -U pip wheel # needed at one point
100-
- retry pip install nose flake8 mock hypothesis # always
100+
- retry pip install nose flake8 mock # always
101101
- pip install $EXTRA_PIP_FLAGS $DEPENDS $OPTIONAL_DEPENDS
102102
- if [ "${COVERAGE}" == "1" ]; then
103103
pip install coverage;

appveyor.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ install:
2222
- SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH%
2323

2424
# Install the dependencies of the project.
25-
- pip install numpy scipy matplotlib nose h5py mock hypothesis pydicom
25+
- pip install numpy scipy matplotlib nose h5py mock pydicom
2626
- pip install .
2727
- SET NIBABEL_DATA_DIR=%CD%\nibabel-data
2828

dev-requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,3 @@
22
-r requirements.txt
33
nose
44
mock
5-
hypothesis

nibabel/cmdline/diff.py

Lines changed: 194 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -39,31 +39,78 @@ def get_opt_parser():
3939
Option("-H", "--header-fields",
4040
dest="header_fields", default='all',
4141
help="Header fields (comma separated) to be printed as well (if present)"),
42+
43+
Option("--ma", "--data-max-abs-diff",
44+
dest="data_max_abs_diff",
45+
type=float,
46+
default=0.0,
47+
help="Maximal absolute difference in data between files to tolerate."),
48+
49+
Option("--mr", "--data-max-rel-diff",
50+
dest="data_max_rel_diff",
51+
type=float,
52+
default=0.0,
53+
help="Maximal relative difference in data between files to tolerate."
54+
" If --data-max-abs-diff is also specified, only the data points "
55+
" with absolute difference greater than that value would be "
56+
" considered for relative difference check."),
57+
Option("--dt", "--datatype",
58+
dest="dtype",
59+
default=np.float64,
60+
help="Enter a numpy datatype such as 'float32'.")
4261
])
4362

4463
return p
4564

4665

4766
def are_values_different(*values):
48-
"""Generically compares values, returns true if different"""
49-
value0 = values[0]
50-
values = values[1:] # to ensure that the first value isn't compared with itself
51-
52-
for value in values:
53-
try: # we sometimes don't want NaN values
54-
if np.any(np.isnan(value0)) and np.any(np.isnan(value)): # if they're both NaN
55-
break
56-
elif np.any(np.isnan(value0)) or np.any(np.isnan(value)): # if only 1 is NaN
57-
return True
67+
"""Generically compare values, return True if different
5868
59-
except TypeError:
60-
pass
69+
Note that this comparison targets the reporting of differences between headers
70+
so has following specifics:
71+
- even a difference in data types is considered a difference, i.e. 1 != 1.0
72+
- nans are considered to be the "same", although generally nan != nan
73+
"""
74+
value0 = values[0]
6175

76+
# to not recompute over again
77+
if isinstance(value0, np.ndarray):
78+
try:
79+
# np.asanyarray for older numpy versions, e.g. 1.7.1, where for
80+
# degenerate arrays (shape ()) it would return a pure scalar
81+
value0_nans = np.asanyarray(np.isnan(value0))
82+
value0_nonnans = np.asanyarray(np.logical_not(value0_nans))
83+
# if value0_nans.size == 1:
84+
# import pdb; pdb.set_trace()
85+
if not np.any(value0_nans):
86+
value0_nans = None
87+
except TypeError as exc:
88+
str_exc = str(exc)
89+
# Not implemented in numpy 1.7.1
90+
if "not supported" in str_exc or "ot implemented" in str_exc:
91+
value0_nans = None
92+
else:
93+
raise
94+
95+
for value in values[1:]:
6296
if type(value0) != type(value): # if types are different, then we consider them different
6397
return True
6498
elif isinstance(value0, np.ndarray):
65-
return np.any(value0 != value)
66-
99+
if value0.dtype != value.dtype or \
100+
value0.shape != value.shape:
101+
return True
102+
# there might be nans and they need special treatment
103+
if value0_nans is not None:
104+
value_nans = np.isnan(value)
105+
if np.any(value0_nans != value_nans):
106+
return True
107+
if np.any(value0[value0_nonnans] != value[value0_nonnans]):
108+
return True
109+
elif np.any(value0 != value):
110+
return True
111+
elif value0 is np.nan:
112+
if value is not np.nan:
113+
return True
67114
elif value0 != value:
68115
return True
69116

@@ -101,8 +148,8 @@ def get_headers_diff(file_headers, names=None):
101148
return difference
102149

103150

104-
def get_data_diff(files):
105-
"""Get difference between md5 values
151+
def get_data_hash_diff(files, dtype=np.float64):
152+
"""Get difference between md5 values of data
106153
107154
Parameters
108155
----------
@@ -115,7 +162,7 @@ def get_data_diff(files):
115162
"""
116163

117164
md5sums = [
118-
hashlib.md5(np.ascontiguousarray(nib.load(f).get_data(), dtype=np.float32)).hexdigest()
165+
hashlib.md5(np.ascontiguousarray(nib.load(f).get_fdata(dtype=dtype))).hexdigest()
119166
for f in files
120167
]
121168

@@ -125,6 +172,86 @@ def get_data_diff(files):
125172
return md5sums
126173

127174

175+
def get_data_diff(files, max_abs=0, max_rel=0, dtype=np.float64):
    """Get difference between data

    Parameters
    ----------
    files: list of (str or ndarray)
      If list of strings is provided -- they must be existing file names
    max_abs: float, optional
      Maximal absolute difference to tolerate.
    max_rel: float, optional
      Maximal relative difference to tolerate.  The relative difference of a
      data point is ``abs(d1 - d2) / (0.5 * (abs(d1) + abs(d2)))``.
      If `max_abs` is specified, then those data points with an absolute
      difference not exceeding it are not considered for relative
      difference testing
    dtype: np, optional
      Datatype to be used when extracting data from files

    Returns
    -------
    diffs: OrderedDict
        An ordered dict with a record per each file which has differences
        with other files subsequent detected.  Each record is a list of
        difference records, one per each file pair (``None`` where the pair
        was not compared or no difference beyond tolerance was found).
        Each difference record is an Ordered Dict with keys
        'abs' and 'rel' showing maximal absolute and relative differences
        in the file or the record ('CMP': 'incompat') if file shapes
        are incompatible.
    """

    # we are doomed to keep them in RAM now
    data = [f if isinstance(f, np.ndarray) else nib.load(f).get_fdata(dtype=dtype)
            for f in files]
    diffs = OrderedDict()
    for i, d1 in enumerate(data[:-1]):
        # populate empty entries for non-compared
        diffs1 = [None] * (i + 1)

        for d2 in data[i + 1:]:
            if d1.shape != d2.shape:
                diffs1.append({'CMP': "incompat"})
                continue

            if d1.size == 0:
                # Nothing to compare, and np.max would raise on empty input
                diffs1.append(None)
                continue

            abs_diff = np.abs(d1 - d2)
            mean_abs = (np.abs(d1) + np.abs(d2)) * 0.5
            # Only points which actually differ can constitute a difference.
            # This also guarantees mean_abs != 0 for every candidate, so the
            # division below is safe.  NaNs compare unequal to 0 and thus
            # remain candidates.
            candidates = abs_diff != 0

            if max_abs:
                candidates[abs_diff <= max_abs] = False

            if not np.any(candidates):
                diffs1.append(None)
                continue

            rel_diff = abs_diff[candidates] / mean_abs[candidates]
            # Reported maximum is over all points which survived the absolute
            # tolerance, regardless of the relative tolerance
            max_rel_diff = np.max(rel_diff)

            if max_rel:
                sub_thr = rel_diff <= max_rel
                # Since we operated on sub-selected values already, we need
                # to plug them back in
                candidates[
                    tuple(indexes[sub_thr] for indexes in np.where(candidates))
                ] = False
                if not np.any(candidates):
                    diffs1.append(None)
                    continue

            diff_rec = OrderedDict()  # so that abs goes before relative
            diff_rec['abs'] = np.max(abs_diff).astype(dtype)
            diff_rec['rel'] = max_rel_diff.astype(dtype)
            diffs1.append(diff_rec)

        if any(diffs1):
            diffs['DATA(diff %d:)' % (i + 1)] = diffs1

    return diffs
253+
254+
128255
def display_diff(files, diff):
129256
"""Format header differences into a nice string
130257
@@ -140,21 +267,27 @@ def display_diff(files, diff):
140267
"""
141268
output = ""
142269
field_width = "{:<15}"
270+
filename_width = "{:<53}"
143271
value_width = "{:<55}"
144272

145273
output += "These files are different.\n"
146-
output += field_width.format('Field')
274+
output += field_width.format('Field/File')
147275

148-
for f in files:
149-
output += value_width.format(os.path.basename(f))
276+
for i, f in enumerate(files, 1):
277+
output += "%d:%s" % (i, filename_width.format(os.path.basename(f)))
150278

151279
output += "\n"
152280

153281
for key, value in diff.items():
154282
output += field_width.format(key)
155283

156284
for item in value:
157-
item_str = str(item)
285+
if isinstance(item, dict):
286+
item_str = ', '.join('%s: %s' % i for i in item.items())
287+
elif item is None:
288+
item_str = '-'
289+
else:
290+
item_str = str(item)
158291
# Value might start/end with some invisible spacing characters so we
159292
# would "condition" it on both ends a bit
160293
item_str = re.sub('^[ \t]+', '<', item_str)
@@ -169,8 +302,39 @@ def display_diff(files, diff):
169302
return output
170303

171304

305+
def diff(files, header_fields='all', data_max_abs_diff=None, data_max_rel_diff=None,
         dtype=np.float64):
    """Collect header and data differences between the given files

    Parameters
    ----------
    files: list of str
      Names of at least two files; each is opened with `nib.load`
    header_fields: str, optional
      Either 'all' (use the keys of the first file's header) or a comma
      separated list of header field names to compare
    data_max_abs_diff: float, optional
      Passed to `get_data_diff` as `max_abs`
    data_max_rel_diff: float, optional
      Passed to `get_data_diff` as `max_rel`
    dtype: np, optional
      Datatype passed on to the data hashing and data comparison helpers

    Returns
    -------
    The mapping produced by `get_headers_diff`, extended with a 'DATA(md5)'
    entry and the records of `get_data_diff` when the data hashes differ and
    the detailed comparison still reports differences.
    """
    assert len(files) >= 2, "Please enter at least two files"

    headers = [nib.load(fname).header for fname in files]

    # 'all' signals "all fields"
    # TODO: header fields might vary across file types, thus prior sensing would be needed
    fields = headers[0].keys() if header_fields == 'all' else header_fields.split(',')

    result = get_headers_diff(headers, fields)

    hash_diffs = get_data_hash_diff(files, dtype)
    if not hash_diffs:
        return result

    # Hashes differ: look at the actual values, which may lead to ignoring
    # the difference in data if it stays within the requested tolerances
    value_diffs = get_data_diff(
        files,
        max_abs=data_max_abs_diff,
        max_rel=data_max_rel_diff,
        dtype=dtype,
    )
    if value_diffs:
        result['DATA(md5)'] = hash_diffs
        result.update(value_diffs)

    return result
333+
334+
172335
def main(args=None, out=None):
173336
"""Getting the show on the road"""
337+
174338
out = out or sys.stdout
175339
parser = get_opt_parser()
176340
(opts, files) = parser.parse_args(args)
@@ -181,27 +345,17 @@ def main(args=None, out=None):
181345
# suppress nibabel format-compliance warnings
182346
nib.imageglobals.logger.level = 50
183347

184-
assert len(files) >= 2, "Please enter at least two files"
185-
186-
file_headers = [nib.load(f).header for f in files]
187-
188-
# signals "all fields"
189-
if opts.header_fields == 'all':
190-
# TODO: header fields might vary across file types, thus prior sensing would be needed
191-
header_fields = file_headers[0].keys()
192-
else:
193-
header_fields = opts.header_fields.split(',')
194-
195-
diff = get_headers_diff(file_headers, header_fields)
196-
data_diff = get_data_diff(files)
197-
198-
if data_diff:
199-
diff['DATA(md5)'] = data_diff
348+
files_diff = diff(
349+
files,
350+
header_fields=opts.header_fields,
351+
data_max_abs_diff=opts.data_max_abs_diff,
352+
data_max_rel_diff=opts.data_max_rel_diff,
353+
dtype=opts.dtype
354+
)
200355

201-
if diff:
202-
out.write(display_diff(files, diff))
356+
if files_diff:
357+
out.write(display_diff(files, files_diff))
203358
raise SystemExit(1)
204-
205359
else:
206360
out.write("These files are identical.\n")
207361
raise SystemExit(0)

0 commit comments

Comments
 (0)