 
 try:
     import sqlalchemy
-except ImportError:
+except ImportError:  # pragma: no cover
     sqlalchemy = None
 
 from dask_sql.input_utils.base import BaseInputPlugin
@@ -35,9 +35,7 @@ def is_correct_input(
 
         return is_sqlalchemy_hive or is_hive_cursor or format == "hive"
 
-    def to_dc(
-        self, input_item: Any, table_name: str, format: str = None, **kwargs
-    ):  # pragma: no cover
+    def to_dc(self, input_item: Any, table_name: str, format: str = None, **kwargs):
         table_name = kwargs.pop("hive_table_name", table_name)
         schema = kwargs.pop("hive_schema_name", "default")
@@ -65,14 +63,16 @@ def to_dc(
         if "InputFormat" in storage_information:
             format = storage_information["InputFormat"].split(".")[-1]
         # databricks format is different, see https://github.com/dask-contrib/dask-sql/issues/83
-        elif "InputFormat" in table_information:
+        elif "InputFormat" in table_information:  # pragma: no cover
             format = table_information["InputFormat"].split(".")[-1]
-        else:
+        else:  # pragma: no cover
             raise RuntimeError(
                 "Do not understand the output of 'DESCRIBE FORMATTED <table>'"
             )
 
-        if format == "TextInputFormat" or format == "SequenceFileInputFormat":
+        if (
+            format == "TextInputFormat" or format == "SequenceFileInputFormat"
+        ):  # pragma: no cover
             storage_description = storage_information.get("Storage Desc Params", {})
             read_function = partial(
                 dd.read_csv,
@@ -81,15 +81,17 @@ def to_dc(
             )
         elif format == "ParquetInputFormat" or format == "MapredParquetInputFormat":
             read_function = dd.read_parquet
-        elif format == "OrcInputFormat":
+        elif format == "OrcInputFormat":  # pragma: no cover
             read_function = dd.read_orc
-        elif format == "JsonInputFormat":
+        elif format == "JsonInputFormat":  # pragma: no cover
             read_function = dd.read_json
-        else:
+        else:  # pragma: no cover
             raise AttributeError(f"Do not understand hive's table format {format}")
 
         def _normalize(loc):
-            if loc.startswith("dbfs:/") and not loc.startswith("dbfs://"):
+            if loc.startswith("dbfs:/") and not loc.startswith(
+                "dbfs://"
+            ):  # pragma: no cover
                 # dask (or better: fsspec) needs to have the URL in a specific form
                 # starting with two // after the protocol
                 loc = f"dbfs://{loc.lstrip('dbfs:')}"
@@ -102,6 +104,19 @@ def _normalize(loc):
         def wrapped_read_function(location, column_information, **kwargs):
             location = _normalize(location)
             logger.debug(f"Reading in hive data from {location}")
+            if format == "ParquetInputFormat" or format == "MapredParquetInputFormat":
+                # Hack needed for parquet files.
+                # If the folder structure is like .../col=3/...
+                # parquet wants to read in the partition information.
+                # However, we add the partition information by ourself
+                # which will lead to problems afterwards
+                # Therefore tell parquet to only read in the columns
+                # we actually care right now
+                kwargs.setdefault("columns", list(column_information.keys()))
+            else:  # pragma: no cover
+                # prevent python to optimize it away and make coverage not respect the
+                # pragma
+                dummy = 0
             df = read_function(location, **kwargs)
 
             logger.debug(f"Applying column information: {column_information}")
@@ -165,7 +180,7 @@ def _parse_hive_table_description(
         schema: str,
         table_name: str,
         partition: str = None,
-    ):  # pragma: no cover
+    ):
         """
         Extract all information from the output
         of the DESCRIBE FORMATTED call, which is unfortunately
@@ -207,7 +222,7 @@ def _parse_hive_table_description(
             elif key == "# Partition Information":
                 mode = "partition"
             elif key.startswith("#"):
-                mode = None
+                mode = None  # pragma: no cover
             elif key:
                 if not value:
                     value = dict()
@@ -223,6 +238,10 @@ def _parse_hive_table_description(
                 elif mode == "partition":
                     partition_information[key] = value
                     last_field = partition_information[key]
+                else:  # pragma: no cover
+                    # prevent python to optimize it away and make coverage not respect the
+                    # pragma
+                    dummy = 0
             elif value and last_field is not None:
                 last_field[value] = value2
@@ -238,7 +257,7 @@ def _parse_hive_partition_description(
         cursor: Union["sqlalchemy.engine.base.Connection", "hive.Cursor"],
         schema: str,
         table_name: str,
-    ):  # pragma: no cover
+    ):
         """
         Extract all partition informaton for a given table
         """
@@ -251,7 +270,7 @@ def _fetch_all_results(
         self,
         cursor: Union["sqlalchemy.engine.base.Connection", "hive.Cursor"],
         sql: str,
-    ):  # pragma: no cover
+    ):
         """
         The pyhive.Cursor and the sqlalchemy connection behave slightly different.
         The former has the fetchall method on the cursor,
@@ -261,5 +280,5 @@ def _fetch_all_results(
 
         try:
             return result.fetchall()
-        except AttributeError:
+        except AttributeError:  # pragma: no cover
             return cursor.fetchall()
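Note: the docstring above describes the difference between the two connection types; a minimal standalone sketch of the same try/except pattern (not the project's public API):

    def fetch_all_results(cursor, sql):
        result = cursor.execute(sql)
        try:
            # a sqlalchemy Connection returns a result object with .fetchall()
            return result.fetchall()
        except AttributeError:
            # a pyhive Cursor's execute() returns None, so ask the cursor itself
            return cursor.fetchall()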