Skip to content

RF+BF: Add tolerances and data types for nib-diff, remove hypothesis dependency #678

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Oct 11, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
cd5a741
RF+ENH: nib-diff - allow to specify absolute and/or relative maximal …
yarikoptic Sep 14, 2018
018eceb
ENH: nib-diff Field/File not just Field in the header
yarikoptic Sep 14, 2018
833b4df
changed as commented out in the pull request
chrispycheng Sep 21, 2018
9b123f6
changes as commented
chrispycheng Sep 21, 2018
1e33ea7
RF: anticipated files of different shapes, fixed table display, corre…
chrispycheng Sep 25, 2018
76ca32f
elaborated docstring, modified get_data_diff to allow direct array in…
chrispycheng Sep 27, 2018
0aa6370
added to diff documentation, undid executable change, took out debugg…
chrispycheng Sep 28, 2018
d057249
undid permission snafu on test_scripts
chrispycheng Sep 28, 2018
76ee358
docstring and function name clarification, change get_data to get_fda…
chrispycheng Oct 2, 2018
034c276
corrected styles per Travis, limited fdata to float32
chrispycheng Oct 2, 2018
19fcdd5
STY: Break overly-long line
effigies Oct 3, 2018
93c7bb6
prepared for future PR to allow modification of dtype used in diff co…
chrispycheng Oct 3, 2018
cd85e09
Merge branch 'enh-diff' of https://github.com/yarikoptic/nibabel into…
chrispycheng Oct 3, 2018
97ead00
added cmdline functionality for modifying datatype used for file data…
chrispycheng Oct 3, 2018
2340a92
added testing for dtype at cmdline, assuming cmdline functionality. c…
chrispycheng Oct 4, 2018
e8d3a4f
RF: do not use hypothesis
yarikoptic Oct 6, 2018
09cfef1
BF: properly compare NaNs and test for >2 arguments given to are_valu…
yarikoptic Oct 6, 2018
1996ead
RF: long -> large, NaN -> nan
yarikoptic Oct 8, 2018
0fb8920
BF: reintroduce try/except guarding around initial np.isnan
yarikoptic Oct 8, 2018
e4c5b52
BF: exception string matching for not implemented in numpy 1.7.1
yarikoptic Oct 8, 2018
e87d581
BF: another workaround for elderly numpy 1.7.1
yarikoptic Oct 8, 2018
f973ce4
BF(DOC): fixed damn trailing whitespaces in a docsting
yarikoptic Oct 9, 2018
22f89e9
Merge branch 'no-hypothesis' of https://github.com/yarikoptic/nibabel…
chrispycheng Oct 10, 2018
3636c4d
added tests to fix coverage problem
chrispycheng Oct 11, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ cache:
- $HOME/.cache/pip
env:
global:
- DEPENDS="six numpy scipy matplotlib h5py pillow pydicom hypothesis"
- DEPENDS="six numpy scipy matplotlib h5py pillow pydicom"
- OPTIONAL_DEPENDS=""
- INSTALL_TYPE="setup"
- EXTRA_WHEELS="https://5cf40426d9f06eb7461d-6fe47d9331aba7cd62fc36c7196769e4.ssl.cf2.rackcdn.com"
Expand Down Expand Up @@ -97,7 +97,7 @@ before_install:
- source venv/bin/activate
- python --version # just to check
- pip install -U pip wheel # needed at one point
- retry pip install nose flake8 mock hypothesis # always
- retry pip install nose flake8 mock # always
- pip install $EXTRA_PIP_FLAGS $DEPENDS $OPTIONAL_DEPENDS
- if [ "${COVERAGE}" == "1" ]; then
pip install coverage;
Expand Down
2 changes: 1 addition & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ install:
- SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH%

# Install the dependencies of the project.
- pip install numpy scipy matplotlib nose h5py mock hypothesis pydicom
- pip install numpy scipy matplotlib nose h5py mock pydicom
- pip install .
- SET NIBABEL_DATA_DIR=%CD%\nibabel-data

Expand Down
1 change: 0 additions & 1 deletion dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@
-r requirements.txt
nose
mock
hypothesis
234 changes: 194 additions & 40 deletions nibabel/cmdline/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,31 +39,78 @@ def get_opt_parser():
Option("-H", "--header-fields",
dest="header_fields", default='all',
help="Header fields (comma separated) to be printed as well (if present)"),

Option("--ma", "--data-max-abs-diff",
dest="data_max_abs_diff",
type=float,
default=0.0,
help="Maximal absolute difference in data between files to tolerate."),

Option("--mr", "--data-max-rel-diff",
dest="data_max_rel_diff",
type=float,
default=0.0,
help="Maximal relative difference in data between files to tolerate."
" If --data-max-abs-diff is also specified, only the data points "
" with absolute difference greater than that value would be "
" considered for relative difference check."),
Option("--dt", "--datatype",
dest="dtype",
default=np.float64,
help="Enter a numpy datatype such as 'float32'.")
])

return p


def are_values_different(*values):
"""Generically compares values, returns true if different"""
value0 = values[0]
values = values[1:] # to ensure that the first value isn't compared with itself

for value in values:
try: # we sometimes don't want NaN values
if np.any(np.isnan(value0)) and np.any(np.isnan(value)): # if they're both NaN
break
elif np.any(np.isnan(value0)) or np.any(np.isnan(value)): # if only 1 is NaN
return True
"""Generically compare values, return True if different

except TypeError:
pass
Note that comparison is targetting reporting of comparison of the headers
so has following specifics:
- even a difference in data types is considered a difference, i.e. 1 != 1.0
- nans are considered to be the "same", although generally nan != nan
"""
value0 = values[0]

# to not recompute over again
if isinstance(value0, np.ndarray):
try:
# np.asarray for elderly numpys, e.g. 1.7.1 where for
# degenerate arrays (shape ()) it would return a pure scalar
value0_nans = np.asanyarray(np.isnan(value0))
value0_nonnans = np.asanyarray(np.logical_not(value0_nans))
# if value0_nans.size == 1:
# import pdb; pdb.set_trace()
if not np.any(value0_nans):
value0_nans = None
except TypeError as exc:
str_exc = str(exc)
# Not implemented in numpy 1.7.1
if "not supported" in str_exc or "ot implemented" in str_exc:
value0_nans = None
else:
raise

for value in values[1:]:
if type(value0) != type(value): # if types are different, then we consider them different
return True
elif isinstance(value0, np.ndarray):
return np.any(value0 != value)

if value0.dtype != value.dtype or \
value0.shape != value.shape:
return True
# there might be nans and they need special treatment
if value0_nans is not None:
value_nans = np.isnan(value)
if np.any(value0_nans != value_nans):
return True
if np.any(value0[value0_nonnans] != value[value0_nonnans]):
return True
elif np.any(value0 != value):
return True
elif value0 is np.nan:
if value is not np.nan:
return True
elif value0 != value:
return True

Expand Down Expand Up @@ -101,8 +148,8 @@ def get_headers_diff(file_headers, names=None):
return difference


def get_data_diff(files):
"""Get difference between md5 values
def get_data_hash_diff(files, dtype=np.float64):
"""Get difference between md5 values of data

Parameters
----------
Expand All @@ -115,7 +162,7 @@ def get_data_diff(files):
"""

md5sums = [
hashlib.md5(np.ascontiguousarray(nib.load(f).get_data(), dtype=np.float32)).hexdigest()
hashlib.md5(np.ascontiguousarray(nib.load(f).get_fdata(dtype=dtype))).hexdigest()
for f in files
]

Expand All @@ -125,6 +172,86 @@ def get_data_diff(files):
return md5sums


def get_data_diff(files, max_abs=0, max_rel=0, dtype=np.float64):
"""Get difference between data

Parameters
----------
files: list of (str or ndarray)
If list of strings is provided -- they must be existing file names
max_abs: float, optional
Maximal absolute difference to tolerate.
max_rel: float, optional
Maximal relative (`abs(diff)/mean(diff)`) difference to tolerate.
If `max_abs` is specified, then those data points with lesser than that
absolute difference, are not considered for relative difference testing
dtype: np, optional
Datatype to be used when extracting data from files

Returns
-------
diffs: OrderedDict
An ordered dict with a record per each file which has differences
with other files subsequent detected. Each record is a list of
difference records, one per each file pair.
Each difference record is an Ordered Dict with possible keys
'abs' or 'rel' showing maximal absolute or relative differences
in the file or the record ('CMP': 'incompat') if file shapes
are incompatible.
"""

# we are doomed to keep them in RAM now
data = [f if isinstance(f, np.ndarray) else nib.load(f).get_fdata(dtype=dtype)
for f in files]
diffs = OrderedDict()
for i, d1 in enumerate(data[:-1]):
# populate empty entries for non-compared
diffs1 = [None] * (i + 1)

for j, d2 in enumerate(data[i + 1:], i + 1):

if d1.shape == d2.shape:
abs_diff = np.abs(d1 - d2)
mean_abs = (np.abs(d1) + np.abs(d2)) * 0.5
candidates = np.logical_or(mean_abs != 0, abs_diff != 0)

if max_abs:
candidates[abs_diff <= max_abs] = False

max_abs_diff = np.max(abs_diff)
if np.any(candidates):
rel_diff = abs_diff[candidates] / mean_abs[candidates]
if max_rel:
sub_thr = rel_diff <= max_rel
# Since we operated on sub-selected values already, we need
# to plug them back in
candidates[
tuple((indexes[sub_thr] for indexes in np.where(candidates)))
] = False
max_rel_diff = np.max(rel_diff)
else:
max_rel_diff = 0

if np.any(candidates):

diff_rec = OrderedDict() # so that abs goes before relative

diff_rec['abs'] = max_abs_diff.astype(dtype)
diff_rec['rel'] = max_rel_diff.astype(dtype)
diffs1.append(diff_rec)
else:
diffs1.append(None)

else:
diffs1.append({'CMP': "incompat"})

if any(diffs1):

diffs['DATA(diff %d:)' % (i + 1)] = diffs1

return diffs


def display_diff(files, diff):
"""Format header differences into a nice string

Expand All @@ -140,21 +267,27 @@ def display_diff(files, diff):
"""
output = ""
field_width = "{:<15}"
filename_width = "{:<53}"
value_width = "{:<55}"

output += "These files are different.\n"
output += field_width.format('Field')
output += field_width.format('Field/File')

for f in files:
output += value_width.format(os.path.basename(f))
for i, f in enumerate(files, 1):
output += "%d:%s" % (i, filename_width.format(os.path.basename(f)))

output += "\n"

for key, value in diff.items():
output += field_width.format(key)

for item in value:
item_str = str(item)
if isinstance(item, dict):
item_str = ', '.join('%s: %s' % i for i in item.items())
elif item is None:
item_str = '-'
else:
item_str = str(item)
# Value might start/end with some invisible spacing characters so we
# would "condition" it on both ends a bit
item_str = re.sub('^[ \t]+', '<', item_str)
Expand All @@ -169,8 +302,39 @@ def display_diff(files, diff):
return output


def diff(files, header_fields='all', data_max_abs_diff=None, data_max_rel_diff=None,
dtype=np.float64):
assert len(files) >= 2, "Please enter at least two files"

file_headers = [nib.load(f).header for f in files]

# signals "all fields"
if header_fields == 'all':
# TODO: header fields might vary across file types, thus prior sensing would be needed
header_fields = file_headers[0].keys()
else:
header_fields = header_fields.split(',')

diff = get_headers_diff(file_headers, header_fields)

data_md5_diffs = get_data_hash_diff(files, dtype)
if data_md5_diffs:
# provide details, possibly triggering the ignore of the difference
# in data
data_diffs = get_data_diff(files,
max_abs=data_max_abs_diff,
max_rel=data_max_rel_diff,
dtype=dtype)
if data_diffs:
diff['DATA(md5)'] = data_md5_diffs
diff.update(data_diffs)

return diff


def main(args=None, out=None):
"""Getting the show on the road"""

out = out or sys.stdout
parser = get_opt_parser()
(opts, files) = parser.parse_args(args)
Expand All @@ -181,27 +345,17 @@ def main(args=None, out=None):
# suppress nibabel format-compliance warnings
nib.imageglobals.logger.level = 50

assert len(files) >= 2, "Please enter at least two files"

file_headers = [nib.load(f).header for f in files]

# signals "all fields"
if opts.header_fields == 'all':
# TODO: header fields might vary across file types, thus prior sensing would be needed
header_fields = file_headers[0].keys()
else:
header_fields = opts.header_fields.split(',')

diff = get_headers_diff(file_headers, header_fields)
data_diff = get_data_diff(files)

if data_diff:
diff['DATA(md5)'] = data_diff
files_diff = diff(
files,
header_fields=opts.header_fields,
data_max_abs_diff=opts.data_max_abs_diff,
data_max_rel_diff=opts.data_max_rel_diff,
dtype=opts.dtype
)

if diff:
out.write(display_diff(files, diff))
if files_diff:
out.write(display_diff(files, files_diff))
raise SystemExit(1)

else:
out.write("These files are identical.\n")
raise SystemExit(0)
Loading