diff --git a/.travis.yml b/.travis.yml index 9a0d227fdf..b6e69d09ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ cache: - $HOME/.cache/pip env: global: - - DEPENDS="six numpy scipy matplotlib h5py pillow pydicom hypothesis" + - DEPENDS="six numpy scipy matplotlib h5py pillow pydicom" - OPTIONAL_DEPENDS="" - INSTALL_TYPE="setup" - EXTRA_WHEELS="https://5cf40426d9f06eb7461d-6fe47d9331aba7cd62fc36c7196769e4.ssl.cf2.rackcdn.com" @@ -97,7 +97,7 @@ before_install: - source venv/bin/activate - python --version # just to check - pip install -U pip wheel # needed at one point - - retry pip install nose flake8 mock hypothesis # always + - retry pip install nose flake8 mock # always - pip install $EXTRA_PIP_FLAGS $DEPENDS $OPTIONAL_DEPENDS - if [ "${COVERAGE}" == "1" ]; then pip install coverage; diff --git a/appveyor.yml b/appveyor.yml index 05510ec886..772bfa142d 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -22,7 +22,7 @@ install: - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH% # Install the dependencies of the project. - - pip install numpy scipy matplotlib nose h5py mock hypothesis pydicom + - pip install numpy scipy matplotlib nose h5py mock pydicom - pip install . - SET NIBABEL_DATA_DIR=%CD%\nibabel-data diff --git a/dev-requirements.txt b/dev-requirements.txt index 014b7a9d01..f63af96cf4 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,4 +2,3 @@ -r requirements.txt nose mock -hypothesis \ No newline at end of file diff --git a/nibabel/cmdline/diff.py b/nibabel/cmdline/diff.py index 21cd7b40a9..1901928cb7 100755 --- a/nibabel/cmdline/diff.py +++ b/nibabel/cmdline/diff.py @@ -39,31 +39,78 @@ def get_opt_parser(): Option("-H", "--header-fields", dest="header_fields", default='all', help="Header fields (comma separated) to be printed as well (if present)"), + + Option("--ma", "--data-max-abs-diff", + dest="data_max_abs_diff", + type=float, + default=0.0, + help="Maximal absolute difference in data between files to tolerate."), + + Option("--mr", "--data-max-rel-diff", + dest="data_max_rel_diff", + type=float, + default=0.0, + help="Maximal relative difference in data between files to tolerate." + " If --data-max-abs-diff is also specified, only the data points " + " with absolute difference greater than that value would be " + " considered for relative difference check."), + Option("--dt", "--datatype", + dest="dtype", + default=np.float64, + help="Enter a numpy datatype such as 'float32'.") ]) return p def are_values_different(*values): - """Generically compares values, returns true if different""" - value0 = values[0] - values = values[1:] # to ensure that the first value isn't compared with itself - - for value in values: - try: # we sometimes don't want NaN values - if np.any(np.isnan(value0)) and np.any(np.isnan(value)): # if they're both NaN - break - elif np.any(np.isnan(value0)) or np.any(np.isnan(value)): # if only 1 is NaN - return True + """Generically compare values, return True if different - except TypeError: - pass + Note that comparison is targetting reporting of comparison of the headers + so has following specifics: + - even a difference in data types is considered a difference, i.e. 1 != 1.0 + - nans are considered to be the "same", although generally nan != nan + """ + value0 = values[0] + # to not recompute over again + if isinstance(value0, np.ndarray): + try: + # np.asarray for elderly numpys, e.g. 1.7.1 where for + # degenerate arrays (shape ()) it would return a pure scalar + value0_nans = np.asanyarray(np.isnan(value0)) + value0_nonnans = np.asanyarray(np.logical_not(value0_nans)) + # if value0_nans.size == 1: + # import pdb; pdb.set_trace() + if not np.any(value0_nans): + value0_nans = None + except TypeError as exc: + str_exc = str(exc) + # Not implemented in numpy 1.7.1 + if "not supported" in str_exc or "ot implemented" in str_exc: + value0_nans = None + else: + raise + + for value in values[1:]: if type(value0) != type(value): # if types are different, then we consider them different return True elif isinstance(value0, np.ndarray): - return np.any(value0 != value) - + if value0.dtype != value.dtype or \ + value0.shape != value.shape: + return True + # there might be nans and they need special treatment + if value0_nans is not None: + value_nans = np.isnan(value) + if np.any(value0_nans != value_nans): + return True + if np.any(value0[value0_nonnans] != value[value0_nonnans]): + return True + elif np.any(value0 != value): + return True + elif value0 is np.nan: + if value is not np.nan: + return True elif value0 != value: return True @@ -101,8 +148,8 @@ def get_headers_diff(file_headers, names=None): return difference -def get_data_diff(files): - """Get difference between md5 values +def get_data_hash_diff(files, dtype=np.float64): + """Get difference between md5 values of data Parameters ---------- @@ -115,7 +162,7 @@ def get_data_diff(files): """ md5sums = [ - hashlib.md5(np.ascontiguousarray(nib.load(f).get_data(), dtype=np.float32)).hexdigest() + hashlib.md5(np.ascontiguousarray(nib.load(f).get_fdata(dtype=dtype))).hexdigest() for f in files ] @@ -125,6 +172,86 @@ def get_data_diff(files): return md5sums +def get_data_diff(files, max_abs=0, max_rel=0, dtype=np.float64): + """Get difference between data + + Parameters + ---------- + files: list of (str or ndarray) + If list of strings is provided -- they must be existing file names + max_abs: float, optional + Maximal absolute difference to tolerate. + max_rel: float, optional + Maximal relative (`abs(diff)/mean(diff)`) difference to tolerate. + If `max_abs` is specified, then those data points with lesser than that + absolute difference, are not considered for relative difference testing + dtype: np, optional + Datatype to be used when extracting data from files + + Returns + ------- + diffs: OrderedDict + An ordered dict with a record per each file which has differences + with other files subsequent detected. Each record is a list of + difference records, one per each file pair. + Each difference record is an Ordered Dict with possible keys + 'abs' or 'rel' showing maximal absolute or relative differences + in the file or the record ('CMP': 'incompat') if file shapes + are incompatible. + """ + + # we are doomed to keep them in RAM now + data = [f if isinstance(f, np.ndarray) else nib.load(f).get_fdata(dtype=dtype) + for f in files] + diffs = OrderedDict() + for i, d1 in enumerate(data[:-1]): + # populate empty entries for non-compared + diffs1 = [None] * (i + 1) + + for j, d2 in enumerate(data[i + 1:], i + 1): + + if d1.shape == d2.shape: + abs_diff = np.abs(d1 - d2) + mean_abs = (np.abs(d1) + np.abs(d2)) * 0.5 + candidates = np.logical_or(mean_abs != 0, abs_diff != 0) + + if max_abs: + candidates[abs_diff <= max_abs] = False + + max_abs_diff = np.max(abs_diff) + if np.any(candidates): + rel_diff = abs_diff[candidates] / mean_abs[candidates] + if max_rel: + sub_thr = rel_diff <= max_rel + # Since we operated on sub-selected values already, we need + # to plug them back in + candidates[ + tuple((indexes[sub_thr] for indexes in np.where(candidates))) + ] = False + max_rel_diff = np.max(rel_diff) + else: + max_rel_diff = 0 + + if np.any(candidates): + + diff_rec = OrderedDict() # so that abs goes before relative + + diff_rec['abs'] = max_abs_diff.astype(dtype) + diff_rec['rel'] = max_rel_diff.astype(dtype) + diffs1.append(diff_rec) + else: + diffs1.append(None) + + else: + diffs1.append({'CMP': "incompat"}) + + if any(diffs1): + + diffs['DATA(diff %d:)' % (i + 1)] = diffs1 + + return diffs + + def display_diff(files, diff): """Format header differences into a nice string @@ -140,13 +267,14 @@ def display_diff(files, diff): """ output = "" field_width = "{:<15}" + filename_width = "{:<53}" value_width = "{:<55}" output += "These files are different.\n" - output += field_width.format('Field') + output += field_width.format('Field/File') - for f in files: - output += value_width.format(os.path.basename(f)) + for i, f in enumerate(files, 1): + output += "%d:%s" % (i, filename_width.format(os.path.basename(f))) output += "\n" @@ -154,7 +282,12 @@ def display_diff(files, diff): output += field_width.format(key) for item in value: - item_str = str(item) + if isinstance(item, dict): + item_str = ', '.join('%s: %s' % i for i in item.items()) + elif item is None: + item_str = '-' + else: + item_str = str(item) # Value might start/end with some invisible spacing characters so we # would "condition" it on both ends a bit item_str = re.sub('^[ \t]+', '<', item_str) @@ -169,8 +302,39 @@ def display_diff(files, diff): return output +def diff(files, header_fields='all', data_max_abs_diff=None, data_max_rel_diff=None, + dtype=np.float64): + assert len(files) >= 2, "Please enter at least two files" + + file_headers = [nib.load(f).header for f in files] + + # signals "all fields" + if header_fields == 'all': + # TODO: header fields might vary across file types, thus prior sensing would be needed + header_fields = file_headers[0].keys() + else: + header_fields = header_fields.split(',') + + diff = get_headers_diff(file_headers, header_fields) + + data_md5_diffs = get_data_hash_diff(files, dtype) + if data_md5_diffs: + # provide details, possibly triggering the ignore of the difference + # in data + data_diffs = get_data_diff(files, + max_abs=data_max_abs_diff, + max_rel=data_max_rel_diff, + dtype=dtype) + if data_diffs: + diff['DATA(md5)'] = data_md5_diffs + diff.update(data_diffs) + + return diff + + def main(args=None, out=None): """Getting the show on the road""" + out = out or sys.stdout parser = get_opt_parser() (opts, files) = parser.parse_args(args) @@ -181,27 +345,17 @@ def main(args=None, out=None): # suppress nibabel format-compliance warnings nib.imageglobals.logger.level = 50 - assert len(files) >= 2, "Please enter at least two files" - - file_headers = [nib.load(f).header for f in files] - - # signals "all fields" - if opts.header_fields == 'all': - # TODO: header fields might vary across file types, thus prior sensing would be needed - header_fields = file_headers[0].keys() - else: - header_fields = opts.header_fields.split(',') - - diff = get_headers_diff(file_headers, header_fields) - data_diff = get_data_diff(files) - - if data_diff: - diff['DATA(md5)'] = data_diff + files_diff = diff( + files, + header_fields=opts.header_fields, + data_max_abs_diff=opts.data_max_abs_diff, + data_max_rel_diff=opts.data_max_rel_diff, + dtype=opts.dtype + ) - if diff: - out.write(display_diff(files, diff)) + if files_diff: + out.write(display_diff(files, files_diff)) raise SystemExit(1) - else: out.write("These files are identical.\n") raise SystemExit(0) diff --git a/nibabel/cmdline/tests/test_utils.py b/nibabel/cmdline/tests/test_utils.py index 4aa387b6e5..e701925870 100644 --- a/nibabel/cmdline/tests/test_utils.py +++ b/nibabel/cmdline/tests/test_utils.py @@ -11,7 +11,7 @@ import nibabel as nib import numpy as np from nibabel.cmdline.utils import * -from nibabel.cmdline.diff import get_headers_diff, display_diff, main, get_data_diff +from nibabel.cmdline.diff import * from os.path import (join as pjoin) from nibabel.testing import data_path from collections import OrderedDict @@ -96,9 +96,9 @@ def test_display_diff(): ("bitpix", [np.array(8).astype(dtype="uint8"), np.array(16).astype(dtype="uint8")]) ]) - expected_output = "These files are different.\n" + "Field hellokitty.nii.gz" \ - " " \ - "privettovarish.nii.gz \n" \ + expected_output = "These files are different.\n" + "Field/File 1:hellokitty.nii.gz" \ + " " \ + "2:privettovarish.nii.gz \n" \ "datatype " \ "2 " \ "4 \n" \ @@ -114,7 +114,47 @@ def test_get_data_diff(): # testing for identical files specifically as md5 may vary by computer test_names = [pjoin(data_path, f) for f in ('standard.nii.gz', 'standard.nii.gz')] - assert_equal(get_data_diff(test_names), []) + assert_equal(get_data_hash_diff(test_names), []) + + # testing the maximum relative and absolute differences' different use cases + test_array = np.arange(16).reshape(4, 4) + test_array_2 = np.arange(1, 17).reshape(4, 4) + test_array_3 = np.arange(2, 18).reshape(4, 4) + test_array_4 = np.arange(100).reshape(10, 10) + test_array_5 = np.arange(64).reshape(8, 8) + + # same shape, 2 files + assert_equal(get_data_diff([test_array, test_array_2]), + OrderedDict([('DATA(diff 1:)', [None, OrderedDict([('abs', 1), ('rel', 2.0)])])])) + + # same shape, 3 files + assert_equal(get_data_diff([test_array, test_array_2, test_array_3]), + OrderedDict([('DATA(diff 1:)', [None, OrderedDict([('abs', 1), ('rel', 2.0)]), + OrderedDict([('abs', 2), ('rel', 2.0)])]), + ('DATA(diff 2:)', [None, None, + OrderedDict([('abs', 1), ('rel', 0.66666666666666663)])])])) + + # same shape, 2 files, modified maximum abs/rel + assert_equal(get_data_diff([test_array, test_array_2], max_abs=2, max_rel=2), OrderedDict()) + + # different shape, 2 files + assert_equal(get_data_diff([test_array_2, test_array_4]), + OrderedDict([('DATA(diff 1:)', [None, {'CMP': 'incompat'}])])) + + # different shape, 3 files + assert_equal(get_data_diff([test_array_4, test_array_5, test_array_2]), + OrderedDict([('DATA(diff 1:)', [None, {'CMP': 'incompat'}, {'CMP': 'incompat'}]), + ('DATA(diff 2:)', [None, None, {'CMP': 'incompat'}])])) + + test_return = get_data_diff([test_array, test_array_2], dtype=np.float32) + assert_equal(type(test_return['DATA(diff 1:)'][1]['abs']), np.float32) + assert_equal(type(test_return['DATA(diff 1:)'][1]['rel']), np.float32) + + test_return_2 = get_data_diff([test_array, test_array_2, test_array_3]) + assert_equal(type(test_return_2['DATA(diff 1:)'][1]['abs']), np.float64) + assert_equal(type(test_return_2['DATA(diff 1:)'][1]['rel']), np.float64) + assert_equal(type(test_return_2['DATA(diff 2:)'][2]['abs']), np.float64) + assert_equal(type(test_return_2['DATA(diff 2:)'][2]['rel']), np.float64) def test_main(): diff --git a/nibabel/tests/test_diff.py b/nibabel/tests/test_diff.py index 2dd1ef9b93..4f99ca145f 100644 --- a/nibabel/tests/test_diff.py +++ b/nibabel/tests/test_diff.py @@ -7,62 +7,68 @@ from os.path import (dirname, join as pjoin, abspath) import numpy as np -from hypothesis import given -import hypothesis.strategies as st - DATA_PATH = abspath(pjoin(dirname(__file__), 'data')) from nibabel.cmdline.diff import are_values_different -# TODO: MAJOR TO DO IS TO FIGURE OUT HOW TO USE HYPOTHESIS FOR LONGER LIST LENGTHS WHILE STILL CONTROLLING FOR OUTCOMES - - -@given(st.data()) -def test_diff_values_int(data): - x = data.draw(st.integers(), label='x') - y = data.draw(st.integers(min_value=x + 1), label='x+1') - z = data.draw(st.integers(max_value=x - 1), label='x-1') - - assert not are_values_different(x, x) - assert are_values_different(x, y) - assert are_values_different(x, z) - assert are_values_different(y, z) +def test_diff_values_int(): + large = 10**30 + assert not are_values_different(0, 0) + assert not are_values_different(1, 1) + assert not are_values_different(large, large) + assert are_values_different(0, 1) + assert are_values_different(1, 2) + assert are_values_different(1, large) -@given(st.data()) -def test_diff_values_float(data): - x = data.draw(st.just(0), label='x') - y = data.draw(st.floats(min_value=1e8), label='y') - z = data.draw(st.floats(max_value=-1e8), label='z') - assert not are_values_different(x, x) - assert are_values_different(x, y) - assert are_values_different(x, z) - assert are_values_different(y, z) +def test_diff_values_float(): + assert not are_values_different(0., 0.) + assert not are_values_different(0., 0., 0.) # can take more + assert not are_values_different(1.1, 1.1) + assert are_values_different(0., 1.1) + assert are_values_different(0., 0, 1.1) + assert are_values_different(1., 2.) -@given(st.data()) -def test_diff_values_mixed(data): - type_float = data.draw(st.floats(), label='float') - type_int = data.draw(st.integers(), label='int') - type_none = data.draw(st.none(), label='none') - - assert are_values_different(type_float, type_int) - assert are_values_different(type_float, type_none) - assert are_values_different(type_int, type_none) +def test_diff_values_mixed(): + assert are_values_different(1.0, 1) + assert are_values_different(1.0, "1") + assert are_values_different(1, "1") + assert are_values_different(1, None) assert are_values_different(np.ndarray([0]), 'hey') - assert not are_values_different(type_none, type_none) - - -@given(st.data()) -def test_diff_values_array(data): - a = data.draw(st.lists(elements=st.integers(min_value=0), min_size=1)) - b = data.draw(st.lists(elements=st.integers(max_value=-1), min_size=1)) - c = data.draw(st.lists(elements=st.floats(min_value=1e8), min_size=1)) - d = data.draw(st.lists(elements=st.floats(max_value=-1e8), min_size=1)) - # TODO: Figure out a way to include 0 in lists (arrays) - - assert are_values_different(a, b) - assert are_values_different(c, d) - assert not are_values_different(a, a) + assert not are_values_different(None, None) + + +def test_diff_values_array(): + from numpy import nan, array, inf + a_int = array([1, 2]) + a_float = a_int.astype(float) + + assert are_values_different(a_int, a_float) + assert are_values_different(a_int, a_int, a_float) + assert are_values_different(np.arange(3), np.arange(1, 4)) + assert are_values_different(np.arange(3), np.arange(4)) + assert are_values_different(np.arange(4), np.arange(4).reshape((2, 2))) + # no broadcasting should kick in - shape difference + assert are_values_different(array([1]), array([1, 1])) + assert not are_values_different(a_int, a_int) + assert not are_values_different(a_float, a_float) + + # nans - we consider them "the same" for the purpose of these comparisons + assert not are_values_different(nan, nan) + assert not are_values_different(nan, nan, nan) + assert are_values_different(nan, nan, 1) + assert are_values_different(1, nan, nan) + assert not are_values_different(array([nan, nan]), array([nan, nan])) + assert not are_values_different(array([nan, nan]), array([nan, nan]), array([nan, nan])) + assert not are_values_different(array([nan, 1]), array([nan, 1])) + assert are_values_different(array([nan, nan]), array([nan, 1])) + assert are_values_different(array([0, nan]), array([nan, 0])) + assert are_values_different(array([1, 2, 3, nan]), array([nan, 3, 5, 4])) + assert are_values_different(nan, 1.0) + assert are_values_different(array([1, 2, 3, nan]), array([3, 4, 5, nan])) + # and some inf should not be a problem + assert not are_values_different(array([0, inf]), array([0, inf])) + assert are_values_different(array([0, inf]), array([inf, 0])) diff --git a/nibabel/tests/test_scripts.py b/nibabel/tests/test_scripts.py index 0aa404a939..a734fbeeda 100644 --- a/nibabel/tests/test_scripts.py +++ b/nibabel/tests/test_scripts.py @@ -72,10 +72,10 @@ def check_nib_diff_examples(): fnames = [pjoin(DATA_PATH, f) for f in ('standard.nii.gz', 'example4d.nii.gz')] code, stdout, stderr = run_command(['nib-diff'] + fnames, check_code=False) - checked_fields = ["Field", "regular", "dim_info", "dim", "datatype", "bitpix", "pixdim", "slice_end", + checked_fields = ["Field/File", "regular", "dim_info", "dim", "datatype", "bitpix", "pixdim", "slice_end", "xyzt_units", "cal_max", "descrip", "qform_code", "sform_code", "quatern_b", "quatern_c", "quatern_d", "qoffset_x", "qoffset_y", "qoffset_z", "srow_x", - "srow_y", "srow_z", "DATA(md5)"] + "srow_y", "srow_z", "DATA(md5)", "DATA(diff 1:)"] for item in checked_fields: assert_true(item in stdout)