Skip to content

Commit dcf7fce

Browse files
antoineviscardi authored and jreback committed
Json normalize nan support (#25619)
1 parent 69ae24b commit dcf7fce

File tree

3 files changed

+62
-57
lines changed

3 files changed

+62
-57
lines changed

doc/source/whatsnew/v0.25.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -215,10 +215,10 @@ I/O
215215
- Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`)
216216
- Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`)
217217
- Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`)
218+
- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data were filled in the resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`)
218219
- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`)
219220
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
220221
-
221-
-
222222

223223

224224
Plotting

pandas/io/json/normalize.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
281281
raise ValueError('Conflicting metadata name {name}, '
282282
'need distinguishing prefix '.format(name=k))
283283

284-
result[k] = np.array(v).repeat(lengths)
284+
# force dtype to object so the metadata is not cast to string
285+
result[k] = np.array(v, dtype=object).repeat(lengths)
285286

286287
return result

pandas/tests/io/json/test_normalize.py

+59-55
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,25 @@ def author_missing_data():
6666
}]
6767

6868

69+
@pytest.fixture
70+
def missing_metadata():
71+
return [
72+
{'name': 'Alice',
73+
'addresses': [{'number': 9562,
74+
'street': 'Morris St.',
75+
'city': 'Massillon',
76+
'state': 'OH',
77+
'zip': 44646}]
78+
},
79+
{'addresses': [{'number': 8449,
80+
'street': 'Spring St.',
81+
'city': 'Elizabethton',
82+
'state': 'TN',
83+
'zip': 37643}]
84+
}
85+
]
86+
87+
6988
class TestJSONNormalize(object):
7089

7190
def test_simple_records(self):
@@ -318,66 +337,51 @@ def test_nested_flattens(self):
318337

319338
assert result == expected
320339

321-
def test_json_normalize_errors(self):
322-
# GH14583: If meta keys are not always present
323-
# a new option to set errors='ignore' has been implemented
324-
i = {
325-
"Trades": [{
326-
"general": {
327-
"tradeid": 100,
328-
"trade_version": 1,
329-
"stocks": [{
330-
331-
"symbol": "AAPL",
332-
"name": "Apple",
333-
"price": "0"
334-
}, {
335-
"symbol": "GOOG",
336-
"name": "Google",
337-
"price": "0"
338-
}
339-
]
340-
}
341-
}, {
342-
"general": {
343-
"tradeid": 100,
344-
"stocks": [{
345-
"symbol": "AAPL",
346-
"name": "Apple",
347-
"price": "0"
348-
}, {
349-
"symbol": "GOOG",
350-
"name": "Google",
351-
"price": "0"
352-
}
353-
]
354-
}
355-
}
356-
]
357-
}
358-
j = json_normalize(data=i['Trades'],
359-
record_path=[['general', 'stocks']],
360-
meta=[['general', 'tradeid'],
361-
['general', 'trade_version']],
362-
errors='ignore')
363-
expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
364-
'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
365-
'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
366-
'price': {0: '0', 1: '0', 2: '0', 3: '0'},
367-
'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
368-
369-
assert j.fillna('').to_dict() == expected
370-
371-
msg = ("Try running with errors='ignore' as key 'trade_version'"
340+
def test_json_normalize_errors(self, missing_metadata):
341+
# GH14583:
342+
# If meta keys are not always present a new option to set
343+
# errors='ignore' has been implemented
344+
345+
msg = ("Try running with errors='ignore' as key 'name'"
372346
" is not always present")
373347
with pytest.raises(KeyError, match=msg):
374348
json_normalize(
375-
data=i['Trades'],
376-
record_path=[['general', 'stocks']],
377-
meta=[['general', 'tradeid'],
378-
['general', 'trade_version']],
349+
data=missing_metadata,
350+
record_path='addresses',
351+
meta='name',
379352
errors='raise')
380353

354+
def test_missing_meta(self, missing_metadata):
355+
# GH25468
356+
# If metadata is nullable with errors set to ignore, the null values
357+
# should be numpy.nan values
358+
result = json_normalize(
359+
data=missing_metadata,
360+
record_path='addresses',
361+
meta='name',
362+
errors='ignore')
363+
ex_data = [
364+
{'city': 'Massillon',
365+
'number': 9562,
366+
'state': 'OH',
367+
'street': 'Morris St.',
368+
'zip': 44646,
369+
'name': 'Alice'},
370+
{'city': 'Elizabethton',
371+
'number': 8449,
372+
'state': 'TN',
373+
'street': 'Spring St.',
374+
'zip': 37643,
375+
'name': np.nan}
376+
]
377+
ex_data = [
378+
['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'],
379+
['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan]
380+
]
381+
columns = ['city', 'number', 'state', 'street', 'zip', 'name']
382+
expected = DataFrame(ex_data, columns=columns)
383+
tm.assert_frame_equal(result, expected)
384+
381385
def test_donot_drop_nonevalues(self):
382386
# GH21356
383387
data = [

0 commit comments

Comments
 (0)