nipy · yarikoptic · Oct 11, 2018 · Sep 14, 2018 · Sep 14, 2018 · Sep 21, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -13,7 +13,7 @@ cache:
     - $HOME/.cache/pip
 env:
     global:
-        - DEPENDS="six numpy scipy matplotlib h5py pillow pydicom hypothesis"
+        - DEPENDS="six numpy scipy matplotlib h5py pillow pydicom"
         - OPTIONAL_DEPENDS=""
         - INSTALL_TYPE="setup"
         - EXTRA_WHEELS="https://5cf40426d9f06eb7461d-6fe47d9331aba7cd62fc36c7196769e4.ssl.cf2.rackcdn.com"
@@ -97,7 +97,7 @@ before_install:
     - source venv/bin/activate
     - python --version # just to check
     - pip install -U pip wheel  # needed at one point
-    - retry pip install nose flake8 mock hypothesis  # always
+    - retry pip install nose flake8 mock  # always
     - pip install $EXTRA_PIP_FLAGS $DEPENDS $OPTIONAL_DEPENDS
     - if [ "${COVERAGE}" == "1" ]; then
       pip install coverage;

diff --git a/appveyor.yml b/appveyor.yml
@@ -22,7 +22,7 @@ install:
   - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH%
 
   # Install the dependencies of the project.
-  - pip install numpy scipy matplotlib nose h5py mock hypothesis pydicom
+  - pip install numpy scipy matplotlib nose h5py mock pydicom
   - pip install .
   - SET NIBABEL_DATA_DIR=%CD%\nibabel-data
 

diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -2,4 +2,3 @@
 -r requirements.txt
 nose
 mock
-hypothesis
diff --git a/nibabel/cmdline/diff.py b/nibabel/cmdline/diff.py
@@ -39,31 +39,78 @@ def get_opt_parser():
         Option("-H", "--header-fields",
                dest="header_fields", default='all',
                help="Header fields (comma separated) to be printed as well (if present)"),
+
+        Option("--ma", "--data-max-abs-diff",
+               dest="data_max_abs_diff",
+               type=float,
+               default=0.0,
+               help="Maximal absolute difference in data between files to tolerate."),
+
+        Option("--mr", "--data-max-rel-diff",
+               dest="data_max_rel_diff",
+               type=float,
+               default=0.0,
+               help="Maximal relative difference in data between files to tolerate."
+                    " If --data-max-abs-diff is also specified, only the data points "
+                    " with absolute difference greater than that value would be "
+                    " considered for relative difference check."),
+        Option("--dt", "--datatype",
+               dest="dtype",
+               default=np.float64,
+               help="Enter a numpy datatype such as 'float32'.")
     ])
 
     return p
 
 
 def are_values_different(*values):
-    """Generically compares values, returns true if different"""
-    value0 = values[0]
-    values = values[1:]  # to ensure that the first value isn't compared with itself
-
-    for value in values:
-        try:  # we sometimes don't want NaN values
-            if np.any(np.isnan(value0)) and np.any(np.isnan(value)):  # if they're both NaN
-                break
-            elif np.any(np.isnan(value0)) or np.any(np.isnan(value)):  # if only 1 is NaN
-                return True
+    """Generically compare values, return True if different
 
-        except TypeError:
-            pass
+    Note that this comparison targets the reporting of header differences,
+    so it has the following specifics:
+    - even a difference in data types is considered a difference, i.e. 1 != 1.0
+    - nans are considered to be the "same", although generally nan != nan
+    """
+    value0 = values[0]
 
+    # to not recompute over again
+    if isinstance(value0, np.ndarray):
+        try:
+            # np.asanyarray for elderly numpys, e.g. 1.7.1 where for
+            # degenerate arrays (shape ()) it would return a pure scalar
+            value0_nans = np.asanyarray(np.isnan(value0))
+            value0_nonnans = np.asanyarray(np.logical_not(value0_nans))
+            # if value0_nans.size == 1:
+            #     import pdb; pdb.set_trace()
+            if not np.any(value0_nans):
+                value0_nans = None
+        except TypeError as exc:
+            str_exc = str(exc)
+            # Not implemented in numpy 1.7.1
+            if "not supported" in str_exc or "ot implemented" in str_exc:
+                value0_nans = None
+            else:
+                raise
+
+    for value in values[1:]:
         if type(value0) != type(value):  # if types are different, then we consider them different
             return True
         elif isinstance(value0, np.ndarray):
-            return np.any(value0 != value)
-
+            if value0.dtype != value.dtype or \
+               value0.shape != value.shape:
+                return True
+            # there might be nans and they need special treatment
+            if value0_nans is not None:
+                value_nans = np.isnan(value)
+                if np.any(value0_nans != value_nans):
+                    return True
+                if np.any(value0[value0_nonnans] != value[value0_nonnans]):
+                    return True
+            elif np.any(value0 != value):
+                return True
+        elif value0 is np.nan:
+            if value is not np.nan:
+                return True
         elif value0 != value:
             return True
 
@@ -101,8 +148,8 @@ def get_headers_diff(file_headers, names=None):
     return difference
 
 
-def get_data_diff(files):
-    """Get difference between md5 values
+def get_data_hash_diff(files, dtype=np.float64):
+    """Get difference between md5 values of data
 
         Parameters
         ----------
@@ -115,7 +162,7 @@ def get_data_diff(files):
         """
 
     md5sums = [
-        hashlib.md5(np.ascontiguousarray(nib.load(f).get_data(), dtype=np.float32)).hexdigest()
+        hashlib.md5(np.ascontiguousarray(nib.load(f).get_fdata(dtype=dtype))).hexdigest()
         for f in files
     ]
 
@@ -125,6 +172,86 @@ def get_data_diff(files):
     return md5sums
 
 
+def get_data_diff(files, max_abs=0, max_rel=0, dtype=np.float64):
+    """Get pairwise differences between data arrays
+
+    Parameters
+    ----------
+    files: list of (str or ndarray)
+      If list of strings is provided -- they must be existing file names
+    max_abs: float, optional
+      Maximal absolute difference to tolerate.
+    max_rel: float, optional
+      Maximal relative (`abs(d1 - d2) / mean(abs(d1), abs(d2))`) difference
+      to tolerate. If `max_abs` is specified, data points whose absolute
+      difference does not exceed it are not tested for relative difference
+    dtype: numpy dtype, optional
+      Datatype to be used when extracting data from files
+
+    Returns
+    -------
+    diffs: OrderedDict
+        Keyed 'DATA(diff N:)' (N is the 1-based file index), one entry per
+        file that differs from any file listed after it. Each value is a
+        list with None for pairs which are not compared or do not differ,
+        and a difference record otherwise. A difference record is an
+        OrderedDict with keys 'abs' and 'rel' holding maximal absolute and
+        relative differences, or {'CMP': 'incompat'} if array shapes
+        differ.
+    """
+
+    # load everything up front: all arrays stay in RAM for pairwise comparison
+    data = [f if isinstance(f, np.ndarray) else nib.load(f).get_fdata(dtype=dtype)
+            for f in files]
+    diffs = OrderedDict()
+    for i, d1 in enumerate(data[:-1]):
+        # None placeholders for files 1..i+1, which are not compared here
+        diffs1 = [None] * (i + 1)
+
+        for j, d2 in enumerate(data[i + 1:], i + 1):
+            # arrays are compared elementwise only if their shapes match
+            if d1.shape == d2.shape:
+                abs_diff = np.abs(d1 - d2)
+                mean_abs = (np.abs(d1) + np.abs(d2)) * 0.5
+                candidates = np.logical_or(mean_abs != 0, abs_diff != 0)
+                # points within the absolute tolerance are not candidates
+                if max_abs:
+                    candidates[abs_diff <= max_abs] = False
+                # reported 'abs' maximum is taken over all points
+                max_abs_diff = np.max(abs_diff)
+                if np.any(candidates):
+                    rel_diff = abs_diff[candidates] / mean_abs[candidates]
+                    if max_rel:
+                        sub_thr = rel_diff <= max_rel
+                        # rel_diff covers only the candidate points, so map the
+                        # sub-threshold ones back to positions in the full mask
+                        candidates[
+                            tuple((indexes[sub_thr] for indexes in np.where(candidates)))
+                        ] = False
+                    max_rel_diff = np.max(rel_diff)
+                else:
+                    max_rel_diff = 0
+                # any candidate left is a difference beyond the tolerances
+                if np.any(candidates):
+
+                    diff_rec = OrderedDict()  # so that abs goes before relative
+
+                    diff_rec['abs'] = max_abs_diff.astype(dtype)
+                    diff_rec['rel'] = max_rel_diff.astype(dtype)
+                    diffs1.append(diff_rec)
+                else:
+                    diffs1.append(None)
+
+            else:
+                diffs1.append({'CMP': "incompat"})
+        # record this file only if some pair differed or was incompatible
+        if any(diffs1):
+
+            diffs['DATA(diff %d:)' % (i + 1)] = diffs1
+
+    return diffs
+
+
 def display_diff(files, diff):
     """Format header differences into a nice string
 
@@ -140,21 +267,27 @@ def display_diff(files, diff):
     """
     output = ""
     field_width = "{:<15}"
+    filename_width = "{:<53}"
     value_width = "{:<55}"
 
     output += "These files are different.\n"
-    output += field_width.format('Field')
+    output += field_width.format('Field/File')
 
-    for f in files:
-        output += value_width.format(os.path.basename(f))
+    for i, f in enumerate(files, 1):
+        output += "%d:%s" % (i, filename_width.format(os.path.basename(f)))
 
     output += "\n"
 
     for key, value in diff.items():
         output += field_width.format(key)
 
         for item in value:
-            item_str = str(item)
+            if isinstance(item, dict):
+                item_str = ', '.join('%s: %s' % i for i in item.items())
+            elif item is None:
+                item_str = '-'
+            else:
+                item_str = str(item)
             # Value might start/end with some invisible spacing characters so we
             # would "condition" it on both ends a bit
             item_str = re.sub('^[ \t]+', '<', item_str)
@@ -169,8 +302,39 @@ def display_diff(files, diff):
     return output
 
 
+def diff(files, header_fields='all', data_max_abs_diff=None, data_max_rel_diff=None,
+         dtype=np.float64):
+    assert len(files) >= 2, "Please enter at least two files"
+    # Collect header differences, then add data differences (if any) to them
+    file_headers = [nib.load(f).header for f in files]
+
+    # 'all' selects every field of the first header, else a comma-separated list
+    if header_fields == 'all':
+        # TODO: header fields might vary across file types, thus prior sensing would be needed
+        header_fields = file_headers[0].keys()
+    else:
+        header_fields = header_fields.split(',')
+
+    diff = get_headers_diff(file_headers, header_fields)
+    # detailed data comparison runs only if get_data_hash_diff() reports something
+    data_md5_diffs = get_data_hash_diff(files, dtype)
+    if data_md5_diffs:
+        # provide details; differences within the tolerances yield an empty
+        # result, in which case the data difference is ignored altogether
+        data_diffs = get_data_diff(files,
+                                   max_abs=data_max_abs_diff,
+                                   max_rel=data_max_rel_diff,
+                                   dtype=dtype)
+        if data_diffs:
+            diff['DATA(md5)'] = data_md5_diffs
+            diff.update(data_diffs)
+
+    return diff
+
+
 def main(args=None, out=None):
     """Getting the show on the road"""
+
     out = out or sys.stdout
     parser = get_opt_parser()
     (opts, files) = parser.parse_args(args)
@@ -181,27 +345,17 @@ def main(args=None, out=None):
         # suppress nibabel format-compliance warnings
         nib.imageglobals.logger.level = 50
 
-    assert len(files) >= 2, "Please enter at least two files"
-
-    file_headers = [nib.load(f).header for f in files]
-
-    # signals "all fields"
-    if opts.header_fields == 'all':
-        # TODO: header fields might vary across file types, thus prior sensing would be needed
-        header_fields = file_headers[0].keys()
-    else:
-        header_fields = opts.header_fields.split(',')
-
-    diff = get_headers_diff(file_headers, header_fields)
-    data_diff = get_data_diff(files)
-
-    if data_diff:
-        diff['DATA(md5)'] = data_diff
+    files_diff = diff(
+        files,
+        header_fields=opts.header_fields,
+        data_max_abs_diff=opts.data_max_abs_diff,
+        data_max_rel_diff=opts.data_max_rel_diff,
+        dtype=opts.dtype
+    )
 
-    if diff:
-        out.write(display_diff(files, diff))
+    if files_diff:
+        out.write(display_diff(files, files_diff))
         raise SystemExit(1)
-
     else:
         out.write("These files are identical.\n")
         raise SystemExit(0)
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,4 +2,3 @@ @@
     -r requirements.txt
     nose
     mock
-    hypothesis