Skip to content

Commit 0a6e8d9

Browse files
committed
Cast to correct dtype in empty dataframes.
1 parent b67003f commit 0a6e8d9

File tree

3 files changed

+42
-7
lines changed

3 files changed

+42
-7
lines changed

pandas_gbq/gbq.py

+39-4
Original file line numberDiff line numberDiff line change
@@ -482,8 +482,12 @@ def run_query(self, query, **kwargs):
482482
self.process_http_error(ex)
483483

484484
schema_fields = [field.to_api_repr() for field in rows_iter.schema]
485-
dtypes = _bqschema_to_dtypes(schema_fields)
486-
df = rows_iter.to_dataframe(dtypes=dtypes)
485+
nullsafe_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
486+
df = rows_iter.to_dataframe(dtypes=nullsafe_dtypes)
487+
488+
if df.empty:
489+
df = _cast_empty_df_dtypes(schema_fields, df)
490+
487491
logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
488492
return df
489493

@@ -633,11 +637,11 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
633637
table.create(table_id, table_schema)
634638

635639

636-
def _bqschema_to_dtypes(schema_fields):
640+
def _bqschema_to_nullsafe_dtypes(schema_fields):
637641
# Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
638642
# default dtype choice.
639643
#
640-
# see:
644+
# See:
641645
# http://pandas.pydata.org/pandas-docs/dev/missing_data.html
642646
# #missing-data-casting-rules-and-indexing
643647
dtype_map = {
@@ -661,6 +665,37 @@ def _bqschema_to_dtypes(schema_fields):
661665
return dtypes
662666

663667

668+
def _cast_empty_df_dtypes(schema_fields, df):
669+
"""Cast any columns in an empty dataframe to correct type.
670+
671+
In an empty dataframe, pandas cannot choose a dtype unless one is
672+
explicitly provided. The _bqschema_to_nullsafe_dtypes() function only
673+
provides dtypes when the dtype safely handles null values. This means
674+
that empty int64 and boolean columns are incorrectly classified as
675+
``object``.
676+
"""
677+
if not df.empty:
678+
raise ValueError(
679+
"DataFrame must be empty in order to cast non-nullsafe dtypes"
680+
)
681+
682+
dtype_map = {
683+
"BOOLEAN": bool,
684+
"INTEGER": np.int64,
685+
}
686+
687+
for field in schema_fields:
688+
column = str(field["name"])
689+
if field["mode"].upper() == "REPEATED":
690+
continue
691+
692+
dtype = dtype_map.get(field["type"].upper())
693+
if dtype:
694+
df[column] = df[column].astype(dtype)
695+
696+
return df
697+
698+
664699
def read_gbq(
665700
query,
666701
project_id=None,

tests/system/test_gbq.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -590,8 +590,8 @@ def test_zero_rows(self, project_id):
590590
)
591591
empty_columns = {
592592
"title": pandas.Series([], dtype=object),
593-
"id": pandas.Series([], dtype=object),
594-
"is_bot": pandas.Series([], dtype=object),
593+
"id": pandas.Series([], dtype=np.dtype(int)),
594+
"is_bot": pandas.Series([], dtype=np.dtype(bool)),
595595
"ts": pandas.Series([], dtype="datetime64[ns, UTC]"),
596596
}
597597
expected_result = DataFrame(empty_columns)

tests/unit/test_gbq.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def no_auth(monkeypatch):
7676
],
7777
)
7878
def test_should_return_bigquery_correctly_typed(type_, expected):
79-
result = gbq._bqschema_to_dtypes(
79+
result = gbq._bqschema_to_nullsafe_dtypes(
8080
[dict(name="x", type=type_, mode="NULLABLE")]
8181
)
8282
if not expected:

0 commit comments

Comments
 (0)