Json normalize nan support (#25619)

antoineviscardi · jreback · commit dcf7fce19a66 · 2019-03-13T13:10:36.000-04:00
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -215,10 +215,10 @@ I/O
 - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`)
 - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`)
 - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`)
+- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data were filled in the resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`)
 - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`)
 - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
 -
--
 
 
 Plotting
diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
@@ -281,6 +281,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
             raise ValueError('Conflicting metadata name {name}, '
                              'need distinguishing prefix '.format(name=k))
 
-        result[k] = np.array(v).repeat(lengths)
+        # forcing dtype to object to avoid the metadata being cast to string
+        result[k] = np.array(v, dtype=object).repeat(lengths)
 
     return result
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
@@ -66,6 +66,25 @@ def author_missing_data():
          }]
 
 
+@pytest.fixture
+def missing_metadata():
+    return [
+        {'name': 'Alice',
+         'addresses': [{'number': 9562,
+                        'street': 'Morris St.',
+                        'city': 'Massillon',
+                        'state': 'OH',
+                        'zip': 44646}]
+         },
+        {'addresses': [{'number': 8449,
+                        'street': 'Spring St.',
+                        'city': 'Elizabethton',
+                        'state': 'TN',
+                        'zip': 37643}]
+         }
+    ]
+
+
 class TestJSONNormalize(object):
 
     def test_simple_records(self):
@@ -318,66 +337,51 @@ def test_nested_flattens(self):
 
         assert result == expected
 
-    def test_json_normalize_errors(self):
-        # GH14583: If meta keys are not always present
-        # a new option to set errors='ignore' has been implemented
-        i = {
-            "Trades": [{
-                "general": {
-                    "tradeid": 100,
-                    "trade_version": 1,
-                    "stocks": [{
-
-                        "symbol": "AAPL",
-                        "name": "Apple",
-                        "price": "0"
-                    }, {
-                        "symbol": "GOOG",
-                        "name": "Google",
-                        "price": "0"
-                    }
-                    ]
-                }
-            }, {
-                "general": {
-                    "tradeid": 100,
-                    "stocks": [{
-                        "symbol": "AAPL",
-                        "name": "Apple",
-                        "price": "0"
-                    }, {
-                        "symbol": "GOOG",
-                        "name": "Google",
-                        "price": "0"
-                    }
-                    ]
-                }
-            }
-            ]
-        }
-        j = json_normalize(data=i['Trades'],
-                           record_path=[['general', 'stocks']],
-                           meta=[['general', 'tradeid'],
-                                 ['general', 'trade_version']],
-                           errors='ignore')
-        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
-                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
-                    'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
-                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
-                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
-
-        assert j.fillna('').to_dict() == expected
-
-        msg = ("Try running with errors='ignore' as key 'trade_version'"
+    def test_json_normalize_errors(self, missing_metadata):
+        # GH14583:
+        # If meta keys are not always present a new option to set
+        # errors='ignore' has been implemented
+
+        msg = ("Try running with errors='ignore' as key 'name'"
                " is not always present")
         with pytest.raises(KeyError, match=msg):
             json_normalize(
-                data=i['Trades'],
-                record_path=[['general', 'stocks']],
-                meta=[['general', 'tradeid'],
-                      ['general', 'trade_version']],
+                data=missing_metadata,
+                record_path='addresses',
+                meta='name',
                 errors='raise')
 
+    def test_missing_meta(self, missing_metadata):
+        # GH25468
+        # If metadata is nullable with errors set to ignore, the null values
+        # should be numpy.nan values
+        result = json_normalize(
+            data=missing_metadata,
+            record_path='addresses',
+            meta='name',
+            errors='ignore')
+        ex_data = [
+            {'city': 'Massillon',
+             'number': 9562,
+             'state': 'OH',
+             'street': 'Morris St.',
+             'zip': 44646,
+             'name': 'Alice'},
+            {'city': 'Elizabethton',
+             'number': 8449,
+             'state': 'TN',
+             'street': 'Spring St.',
+             'zip': 37643,
+             'name': np.nan}
+        ]
+        ex_data = [
+            ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'],
+            ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan]
+        ]
+        columns = ['city', 'number', 'state', 'street', 'zip', 'name']
+        expected = DataFrame(ex_data, columns=columns)
+        tm.assert_frame_equal(result, expected)
+
     def test_donot_drop_nonevalues(self):
         # GH21356
         data = [