Cast to correct dtype in empty dataframes.

tswast · tswast · commit 0a6e8d91c341 · 2019-02-12T09:53:25.000-08:00
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
@@ -482,8 +482,12 @@ def run_query(self, query, **kwargs):
             self.process_http_error(ex)
 
         schema_fields = [field.to_api_repr() for field in rows_iter.schema]
-        dtypes = _bqschema_to_dtypes(schema_fields)
-        df = rows_iter.to_dataframe(dtypes=dtypes)
+        nullsafe_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
+        df = rows_iter.to_dataframe(dtypes=nullsafe_dtypes)
+
+        if df.empty:
+            df = _cast_empty_df_dtypes(schema_fields, df)
+
         logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
         return df
 
@@ -633,11 +637,11 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
         table.create(table_id, table_schema)
 
 
-def _bqschema_to_dtypes(schema_fields):
+def _bqschema_to_nullsafe_dtypes(schema_fields):
     # Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
     # default dtype choice.
     #
-    # see:
+    # See:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
     dtype_map = {
@@ -661,6 +665,37 @@ def _bqschema_to_dtypes(schema_fields):
     return dtypes
 
 
+def _cast_empty_df_dtypes(schema_fields, df):
+    """Cast any columns in an empty dataframe to correct type.
+
+    In an empty dataframe, pandas cannot choose a dtype unless one is
+    explicitly provided. The _bqschema_to_nullsafe_dtypes() function only
+    provides dtypes when the dtype safely handles null values. This means
+    that empty int64 and boolean columns are incorrectly classified as
+    ``object``.
+    """
+    if not df.empty:
+        raise ValueError(
+            "DataFrame must be empty in order to cast non-nullsafe dtypes"
+        )
+
+    dtype_map = {
+        "BOOLEAN": bool,
+        "INTEGER": np.int64,
+    }
+
+    for field in schema_fields:
+        column = str(field["name"])
+        if field["mode"].upper() == "REPEATED":
+            continue
+
+        dtype = dtype_map.get(field["type"].upper())
+        if dtype:
+            df[column] = df[column].astype(dtype)
+
+    return df
+
+
 def read_gbq(
     query,
     project_id=None,
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
@@ -590,8 +590,8 @@ def test_zero_rows(self, project_id):
         )
         empty_columns = {
             "title": pandas.Series([], dtype=object),
-            "id": pandas.Series([], dtype=object),
-            "is_bot": pandas.Series([], dtype=object),
+            "id": pandas.Series([], dtype=np.dtype(int)),
+            "is_bot": pandas.Series([], dtype=np.dtype(bool)),
             "ts": pandas.Series([], dtype="datetime64[ns, UTC]"),
         }
         expected_result = DataFrame(empty_columns)
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
@@ -76,7 +76,7 @@ def no_auth(monkeypatch):
     ],
 )
 def test_should_return_bigquery_correctly_typed(type_, expected):
-    result = gbq._bqschema_to_dtypes(
+    result = gbq._bqschema_to_nullsafe_dtypes(
         [dict(name="x", type=type_, mode="NULLABLE")]
     )
     if not expected:

Original file line number	Diff line number	Diff line change
`@@ -590,8 +590,8 @@ def test_zero_rows(self, project_id):`
`590`	`590`	`)`
`591`	`591`	`empty_columns = {`
`592`	`592`	`"title": pandas.Series([], dtype=object),`
`593`		`- "id": pandas.Series([], dtype=object),`
`594`		`- "is_bot": pandas.Series([], dtype=object),`
	`593`	`+ "id": pandas.Series([], dtype=np.dtype(int)),`
	`594`	`+ "is_bot": pandas.Series([], dtype=np.dtype(bool)),`
`595`	`595`	`"ts": pandas.Series([], dtype="datetime64[ns, UTC]"),`
`596`	`596`	`}`
`597`	`597`	`expected_result = DataFrame(empty_columns)`
Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ def no_auth(monkeypatch):`
`76`	`76`	`],`
`77`	`77`	`)`
`78`	`78`	`def test_should_return_bigquery_correctly_typed(type_, expected):`
`79`		`- result = gbq._bqschema_to_dtypes(`
	`79`	`+ result = gbq._bqschema_to_nullsafe_dtypes(`
`80`	`80`	`[dict(name="x", type=type_, mode="NULLABLE")]`
`81`	`81`	`)`
`82`	`82`	`if not expected:`