@@ -39,31 +39,78 @@ def get_opt_parser():
39
39
Option ("-H" , "--header-fields" ,
40
40
dest = "header_fields" , default = 'all' ,
41
41
help = "Header fields (comma separated) to be printed as well (if present)" ),
42
+
43
+ Option ("--ma" , "--data-max-abs-diff" ,
44
+ dest = "data_max_abs_diff" ,
45
+ type = float ,
46
+ default = 0.0 ,
47
+ help = "Maximal absolute difference in data between files to tolerate." ),
48
+
49
+ Option ("--mr" , "--data-max-rel-diff" ,
50
+ dest = "data_max_rel_diff" ,
51
+ type = float ,
52
+ default = 0.0 ,
53
+ help = "Maximal relative difference in data between files to tolerate."
54
+ " If --data-max-abs-diff is also specified, only the data points "
55
+ " with absolute difference greater than that value would be "
56
+ " considered for relative difference check." ),
57
+ Option ("--dt" , "--datatype" ,
58
+ dest = "dtype" ,
59
+ default = np .float64 ,
60
+ help = "Enter a numpy datatype such as 'float32'." )
42
61
])
43
62
44
63
return p
45
64
46
65
47
66
def are_values_different (* values ):
48
- """Generically compares values, returns true if different"""
49
- value0 = values [0 ]
50
- values = values [1 :] # to ensure that the first value isn't compared with itself
51
-
52
- for value in values :
53
- try : # we sometimes don't want NaN values
54
- if np .any (np .isnan (value0 )) and np .any (np .isnan (value )): # if they're both NaN
55
- break
56
- elif np .any (np .isnan (value0 )) or np .any (np .isnan (value )): # if only 1 is NaN
57
- return True
67
+ """Generically compare values, return True if different
58
68
59
- except TypeError :
60
- pass
69
+ Note that comparison is targetting reporting of comparison of the headers
70
+ so has following specifics:
71
+ - even a difference in data types is considered a difference, i.e. 1 != 1.0
72
+ - nans are considered to be the "same", although generally nan != nan
73
+ """
74
+ value0 = values [0 ]
61
75
76
+ # to not recompute over again
77
+ if isinstance (value0 , np .ndarray ):
78
+ try :
79
+ # np.asarray for elderly numpys, e.g. 1.7.1 where for
80
+ # degenerate arrays (shape ()) it would return a pure scalar
81
+ value0_nans = np .asanyarray (np .isnan (value0 ))
82
+ value0_nonnans = np .asanyarray (np .logical_not (value0_nans ))
83
+ # if value0_nans.size == 1:
84
+ # import pdb; pdb.set_trace()
85
+ if not np .any (value0_nans ):
86
+ value0_nans = None
87
+ except TypeError as exc :
88
+ str_exc = str (exc )
89
+ # Not implemented in numpy 1.7.1
90
+ if "not supported" in str_exc or "ot implemented" in str_exc :
91
+ value0_nans = None
92
+ else :
93
+ raise
94
+
95
+ for value in values [1 :]:
62
96
if type (value0 ) != type (value ): # if types are different, then we consider them different
63
97
return True
64
98
elif isinstance (value0 , np .ndarray ):
65
- return np .any (value0 != value )
66
-
99
+ if value0 .dtype != value .dtype or \
100
+ value0 .shape != value .shape :
101
+ return True
102
+ # there might be nans and they need special treatment
103
+ if value0_nans is not None :
104
+ value_nans = np .isnan (value )
105
+ if np .any (value0_nans != value_nans ):
106
+ return True
107
+ if np .any (value0 [value0_nonnans ] != value [value0_nonnans ]):
108
+ return True
109
+ elif np .any (value0 != value ):
110
+ return True
111
+ elif value0 is np .nan :
112
+ if value is not np .nan :
113
+ return True
67
114
elif value0 != value :
68
115
return True
69
116
@@ -101,8 +148,8 @@ def get_headers_diff(file_headers, names=None):
101
148
return difference
102
149
103
150
104
- def get_data_diff (files ):
105
- """Get difference between md5 values
151
+ def get_data_hash_diff (files , dtype = np . float64 ):
152
+ """Get difference between md5 values of data
106
153
107
154
Parameters
108
155
----------
@@ -115,7 +162,7 @@ def get_data_diff(files):
115
162
"""
116
163
117
164
md5sums = [
118
- hashlib .md5 (np .ascontiguousarray (nib .load (f ).get_data (), dtype = np . float32 )).hexdigest ()
165
+ hashlib .md5 (np .ascontiguousarray (nib .load (f ).get_fdata ( dtype = dtype ) )).hexdigest ()
119
166
for f in files
120
167
]
121
168
@@ -125,6 +172,86 @@ def get_data_diff(files):
125
172
return md5sums
126
173
127
174
175
def get_data_diff(files, max_abs=0, max_rel=0, dtype=np.float64):
    """Get difference between data

    Every item of `files` is compared against every subsequent item.

    Parameters
    ----------
    files: list of (str or ndarray)
      If list of strings is provided -- they must be existing file names.
      Arrays are used as they are (no loading and no conversion to `dtype`).
    max_abs: float, optional
      Maximal absolute difference to tolerate.  A false value (0, None)
      disables the check.
    max_rel: float, optional
      Maximal relative difference to tolerate, where the relative difference
      of two data points is `abs(d1 - d2) / (0.5 * (abs(d1) + abs(d2)))`.
      If `max_abs` is specified, then those data points with lesser than that
      absolute difference, are not considered for relative difference testing.
      A false value (0, None) disables the check.
    dtype: np, optional
      Datatype to be used when extracting data from files, and to which the
      reported maximal differences are converted

    Returns
    -------
    diffs: OrderedDict
      An ordered dict with a record per each file which has differences
      with other files subsequent detected.  Keys are of the form
      'DATA(diff N:)' with N being the 1-based index of the file.
      Each record is a list with one entry per file: None for the file itself,
      for the files preceding it, and for the files which do not differ
      (beyond the tolerances), and a difference record otherwise.
      Each difference record is an OrderedDict with keys 'abs' and 'rel'
      holding maximal absolute and relative differences, or the record
      {'CMP': 'incompat'} if data shapes are incompatible.
    """

    # we are doomed to keep them in RAM now
    data = [f if isinstance(f, np.ndarray) else nib.load(f).get_fdata(dtype=dtype)
            for f in files]
    diffs = OrderedDict()
    for i, d1 in enumerate(data[:-1]):
        # populate empty entries for non-compared (itself and preceding files)
        diffs1 = [None] * (i + 1)

        for d2 in data[i + 1:]:
            if d1.shape != d2.shape:
                diffs1.append({'CMP': "incompat"})
                continue

            abs_diff = np.abs(d1 - d2)
            # Only the points which actually differ could constitute a
            # difference.  Selecting on non-zero magnitude instead would
            # report identical non-zero data as different whenever no
            # tolerance is given.
            candidates = abs_diff != 0
            if max_abs:
                candidates[abs_diff <= max_abs] = False

            if not np.any(candidates):
                diffs1.append(None)
                continue

            # The mean magnitude cannot be 0 where the values differ, so the
            # division is safe for all the candidates
            mean_abs = (np.abs(d1[candidates]) + np.abs(d2[candidates])) * 0.5
            rel_diff = abs_diff[candidates] / mean_abs

            # `<=` (rather than a negated `>`) keeps NaNs counted as differences
            if max_rel and np.all(rel_diff <= max_rel):
                diffs1.append(None)
                continue

            diff_rec = OrderedDict()  # so that abs goes before relative
            # maximal absolute difference is reported across all the points,
            # relative -- across those which passed the `max_abs` threshold
            diff_rec['abs'] = np.max(abs_diff).astype(dtype)
            diff_rec['rel'] = np.max(rel_diff).astype(dtype)
            diffs1.append(diff_rec)

        if any(diffs1):
            diffs['DATA(diff %d:)' % (i + 1)] = diffs1

    return diffs
253
+
254
+
128
255
def display_diff (files , diff ):
129
256
"""Format header differences into a nice string
130
257
@@ -140,21 +267,27 @@ def display_diff(files, diff):
140
267
"""
141
268
output = ""
142
269
field_width = "{:<15}"
270
+ filename_width = "{:<53}"
143
271
value_width = "{:<55}"
144
272
145
273
output += "These files are different.\n "
146
- output += field_width .format ('Field' )
274
+ output += field_width .format ('Field/File ' )
147
275
148
- for f in files :
149
- output += value_width .format (os .path .basename (f ))
276
+ for i , f in enumerate ( files , 1 ) :
277
+ output += "%d:%s" % ( i , filename_width .format (os .path .basename (f ) ))
150
278
151
279
output += "\n "
152
280
153
281
for key , value in diff .items ():
154
282
output += field_width .format (key )
155
283
156
284
for item in value :
157
- item_str = str (item )
285
+ if isinstance (item , dict ):
286
+ item_str = ', ' .join ('%s: %s' % i for i in item .items ())
287
+ elif item is None :
288
+ item_str = '-'
289
+ else :
290
+ item_str = str (item )
158
291
# Value might start/end with some invisible spacing characters so we
159
292
# would "condition" it on both ends a bit
160
293
item_str = re .sub ('^[ \t ]+' , '<' , item_str )
@@ -169,8 +302,39 @@ def display_diff(files, diff):
169
302
return output
170
303
171
304
305
def diff(files, header_fields='all', data_max_abs_diff=None, data_max_rel_diff=None,
         dtype=np.float64):
    """Collect header and data differences between files

    Parameters
    ----------
    files: list of str
      Names of the files to compare (each is opened with `nib.load`).
      At least two must be given.
    header_fields: str, optional
      Either 'all', in which case the fields of the header of the first file
      are used, or a comma separated list of header fields to compare
    data_max_abs_diff: float, optional
      Passed as `max_abs` into `get_data_diff`
    data_max_rel_diff: float, optional
      Passed as `max_rel` into `get_data_diff`
    dtype: np, optional
      Datatype passed to both `get_data_hash_diff` and `get_data_diff`

    Returns
    -------
    The result of `get_headers_diff`, to which a 'DATA(md5)' entry and the
    entries returned by `get_data_diff` are added if the data hashes differ
    and `get_data_diff` reports anything.
    """
    assert len(files) >= 2, "Please enter at least two files"

    headers = [nib.load(fname).header for fname in files]

    # 'all' signals "all fields"
    # TODO: header fields might vary across file types, thus prior sensing would be needed
    fields = headers[0].keys() if header_fields == 'all' else header_fields.split(',')

    report = get_headers_diff(headers, fields)

    md5_report = get_data_hash_diff(files, dtype)
    if not md5_report:
        # hashes agree -- nothing to look at in the data
        return report

    # provide details, possibly triggering the ignore of the difference
    # in data
    detailed = get_data_diff(
        files,
        max_abs=data_max_abs_diff,
        max_rel=data_max_rel_diff,
        dtype=dtype,
    )
    if detailed:
        report['DATA(md5)'] = md5_report
        report.update(detailed)

    return report
333
+
334
+
172
335
def main (args = None , out = None ):
173
336
"""Getting the show on the road"""
337
+
174
338
out = out or sys .stdout
175
339
parser = get_opt_parser ()
176
340
(opts , files ) = parser .parse_args (args )
@@ -181,27 +345,17 @@ def main(args=None, out=None):
181
345
# suppress nibabel format-compliance warnings
182
346
nib .imageglobals .logger .level = 50
183
347
184
- assert len (files ) >= 2 , "Please enter at least two files"
185
-
186
- file_headers = [nib .load (f ).header for f in files ]
187
-
188
- # signals "all fields"
189
- if opts .header_fields == 'all' :
190
- # TODO: header fields might vary across file types, thus prior sensing would be needed
191
- header_fields = file_headers [0 ].keys ()
192
- else :
193
- header_fields = opts .header_fields .split (',' )
194
-
195
- diff = get_headers_diff (file_headers , header_fields )
196
- data_diff = get_data_diff (files )
197
-
198
- if data_diff :
199
- diff ['DATA(md5)' ] = data_diff
348
+ files_diff = diff (
349
+ files ,
350
+ header_fields = opts .header_fields ,
351
+ data_max_abs_diff = opts .data_max_abs_diff ,
352
+ data_max_rel_diff = opts .data_max_rel_diff ,
353
+ dtype = opts .dtype
354
+ )
200
355
201
- if diff :
202
- out .write (display_diff (files , diff ))
356
+ if files_diff :
357
+ out .write (display_diff (files , files_diff ))
203
358
raise SystemExit (1 )
204
-
205
359
else :
206
360
out .write ("These files are identical.\n " )
207
361
raise SystemExit (0 )
0 commit comments