Commit bd39414

CLN: Use to_dataframe to download query results.
This allows us to remove logic for parsing the schema and align with google-cloud-bigquery.
1 parent b0254c4
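For context: in google-cloud-bigquery, `QueryJob.result()` returns a `RowIterator`, and `RowIterator.to_dataframe()` downloads all rows and maps BigQuery column types to pandas dtypes in a single call. A minimal sketch of the pattern this commit adopts, assuming google-cloud-bigquery is installed and default credentials are configured:

    from google.cloud import bigquery

    client = bigquery.Client()

    # result() returns a RowIterator; to_dataframe() fetches every row and
    # builds the DataFrame, so no manual schema parsing is needed.
    rows_iter = client.query(
        "SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10"
    ).result()
    df = rows_iter.to_dataframe()
    print(rows_iter.total_rows, list(df.columns))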

4 files changed: +37 −34 lines

benchmark/README.md (+16)

@@ -0,0 +1,16 @@
+# pandas-gbq benchmarks
+
+This directory contains a few scripts which are useful for performance
+testing the pandas-gbq library. Use cProfile to time the script and see
+details about where time is spent. To avoid timing how long BigQuery takes to
+execute a query, run the benchmark twice to ensure the results are cached.
+
+## `read_gbq`
+
+Read a small table (a few KB).
+
+    python -m cProfile --sort=cumtime read_gbq_small_results.py
+
+Read a large-ish table (100+ MB).
+
+    python -m cProfile --sort=cumtime read_gbq_large_results.py
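Not part of the commit, but a useful companion when profiling these scripts: save the profile to a file with cProfile's `-o` flag

    python -m cProfile -o read_gbq_small.prof read_gbq_small_results.py

and inspect it offline with the standard-library `pstats` module (the `.prof` filename above is just an example):

    import pstats

    # Print the 10 most expensive call sites by cumulative time, the same
    # ordering that --sort=cumtime prints to stdout.
    stats = pstats.Stats("read_gbq_small.prof")
    stats.sort_stats("cumtime").print_stats(10)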

benchmark/read_gbq_large_results.py (+8)

@@ -0,0 +1,8 @@
+
+import pandas_gbq
+
+# Select 163 MB worth of data, to time how long it takes to download large
+# result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
+    dialect="standard")

benchmark/read_gbq_small_results.py (+7)

@@ -0,0 +1,7 @@
+
+import pandas_gbq
+
+# Select a few KB worth of data, to time downloading small result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
+    dialect="standard")
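Both benchmark scripts rely on application-default credentials and the project associated with them. If that is not configured, `read_gbq` also accepts an explicit billing project. A variant of the small-results script, where "my-project" is a placeholder for your own GCP project ID:

    import pandas_gbq

    # project_id is read_gbq's billing-project parameter; "my-project" is a
    # placeholder, not a real project.
    df = pandas_gbq.read_gbq(
        "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
        project_id="my-project",
        dialect="standard",
    )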

pandas_gbq/gbq.py (+6 −34)

@@ -1,12 +1,9 @@
 import logging
-import os
 import time
 import warnings
-from collections import OrderedDict
 from datetime import datetime
 
 import numpy as np
-from pandas import DataFrame
 
 from pandas_gbq.exceptions import AccessDenied
 
@@ -69,7 +66,7 @@ def _check_google_client_version():
 def _test_google_api_imports():
 
     try:
-        import pydata_google_auth
+        import pydata_google_auth  # noqa
     except ImportError as ex:
         raise ImportError(
             "pandas-gbq requires pydata-google-auth: {0}".format(ex)
@@ -483,15 +480,9 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        result_rows = list(rows_iter)
-        total_rows = rows_iter.total_rows
-        schema = {
-            "fields": [field.to_api_repr() for field in rows_iter.schema]
-        }
-
-        logger.debug("Got {} rows.\n".format(total_rows))
-
-        return schema, result_rows
+        df = rows_iter.to_dataframe()
+        logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
+        return df
 
     def load_data(
         self,
@@ -662,25 +653,6 @@ def _parse_schema(schema_fields):
         yield name, dtype
 
 
-def _parse_data(schema, rows):
-
-    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
-    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
-
-    for column in df:
-        dtype = column_dtypes[column]
-        null_safe = (
-            df[column].notnull().all()
-            or dtype == float
-            or dtype == "datetime64[ns]"
-        )
-        if dtype and null_safe:
-            df[column] = df[column].astype(
-                column_dtypes[column], errors="ignore"
-            )
-    return df
-
-
 def read_gbq(
     query,
     project_id=None,
@@ -833,8 +805,8 @@ def read_gbq(
         credentials=credentials,
         private_key=private_key,
     )
-    schema, rows = connector.run_query(query, configuration=configuration)
-    final_df = _parse_data(schema, rows)
+
+    final_df = connector.run_query(query, configuration=configuration)
 
     # Reindex the DataFrame on the provided column
     if index_col is not None:

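The context lines in the final hunk mention reindexing on a provided column: `read_gbq` exposes this via its `index_col` parameter, which is now applied to the DataFrame returned by `run_query`. A usage sketch; the column names are assumptions about the public `country_code_iso` table, so substitute real ones as needed:

    import pandas_gbq

    # index_col makes read_gbq set that result column as the DataFrame index.
    # The column names below are assumed for illustration, not verified.
    df = pandas_gbq.read_gbq(
        "SELECT country_name, alpha_2_code "
        "FROM `bigquery-public-data.utility_us.country_code_iso`",
        index_col="alpha_2_code",
        dialect="standard",
    )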