File tree 2 files changed +14
-3
lines changed
python/pyspark/sql/pandas 2 files changed +14
-3
lines changed Original file line number Diff line number Diff line change @@ -34,7 +34,7 @@ class PandasConversionMixin(object):
34
34
"""
35
35
36
36
@since(1.3)
37
- def toPandas(self):
37
+ def toPandas(self, self_destruct=False):
38
38
"""
39
39
Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.
40
40
@@ -103,10 +103,18 @@ def toPandas(self):
103
103
batches = self.toDF(*tmp_column_names)._collect_as_arrow()
104
104
if len(batches) > 0:
105
105
table = pyarrow.Table.from_batches(batches)
106
+ del batches
106
107
# Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
107
108
# values, but we should use datetime.date to match the behavior with when
108
109
# Arrow optimization is disabled.
109
- pdf = table.to_pandas(date_as_object=True)
110
+ pandas_options = {'date_as_object': True}
111
+ if self_destruct:
112
+ pandas_options.update({
113
+ 'self_destruct': True,
114
+ 'split_blocks': True,
115
+ 'use_threads': False,
116
+ })
117
+ pdf = table.to_pandas(**pandas_options)
110
118
# Rename back to the original column names.
111
119
pdf.columns = self.columns
112
120
for field in self.schema:
Original file line number Diff line number Diff line change @@ -90,7 +90,10 @@ def load_stream(self, stream):
90
90
import pyarrow as pa
91
91
reader = pa.ipc.open_stream(stream)
92
92
for batch in reader:
93
- yield batch
93
+ split_batch = pa.RecordBatch.from_arrays([
94
+ pa.concat_arrays([array]) for array in batch
95
+ ], schema=batch.schema)
96
+ yield split_batch
94
97
95
98
def __repr__(self):
96
99
return "ArrowStreamSerializer"
You can’t perform that action at this time.
0 commit comments