24
24
import ibis .backends .bigquery as ibis_bigquery
25
25
import ibis .common .deferred # type: ignore
26
26
import ibis .expr .datatypes as ibis_dtypes
27
+ import ibis .expr .operations as ibis_ops
27
28
import ibis .expr .types as ibis_types
28
29
import pandas
29
30
36
37
from bigframes .core .ordering import (
37
38
ascending_over ,
38
39
encode_order_string ,
39
- IntegerEncoding ,
40
40
join_orderings ,
41
41
OrderingExpression ,
42
42
RowOrdering ,
@@ -71,19 +71,16 @@ def __init__(
71
71
# Allow creating a DataFrame directly from an Ibis table expression.
72
72
# TODO(swast): Validate that each column references the same table (or
73
73
# no table for literal values).
74
- self ._columns = tuple (columns )
74
+ self ._columns = tuple (
75
+ column .resolve (table )
76
+ # TODO(https://github.com/ibis-project/ibis/issues/7613): use
77
+ # public API to refer to Deferred type.
78
+ if isinstance (column , ibis .common .deferred .Deferred ) else column
79
+ for column in columns
80
+ )
75
81
# To allow for more efficient lookup by column name, create a
76
82
# dictionary mapping names to column values.
77
- self ._column_names = {
78
- (
79
- column .resolve (table )
80
- # TODO(https://github.com/ibis-project/ibis/issues/7613): use
81
- # public API to refer to Deferred type.
82
- if isinstance (column , ibis .common .deferred .Deferred )
83
- else column
84
- ).get_name (): column
85
- for column in self ._columns
86
- }
83
+ self ._column_names = {column .get_name (): column for column in self ._columns }
87
84
88
85
@property
89
86
def columns (self ) -> typing .Tuple [ibis_types .Value , ...]:
@@ -139,10 +136,6 @@ def projection(
139
136
for expression , id in expression_id_pairs
140
137
]
141
138
result = self ._select (tuple (values )) # type: ignore
142
-
143
- # Need to reproject to convert ibis Scalar to ibis Column object
144
- if any (exp_id [0 ].is_const for exp_id in expression_id_pairs ):
145
- result = result ._reproject_to_table ()
146
139
return result
147
140
148
141
@abc .abstractmethod
@@ -300,8 +293,6 @@ def _to_ibis_expr(
300
293
ArrayValue objects are sorted, so the following options are available
301
294
to reflect this in the ibis expression.
302
295
303
- * "offset_col": Zero-based offsets are generated as a column, this will
304
- not sort the rows however.
305
296
* "string_encoded": An ordered string column is provided in output table.
306
297
* "unordered": No ordering information will be provided in output. Only
307
298
value columns are projected.
@@ -355,6 +346,10 @@ def _to_ibis_expr(
355
346
return table
356
347
357
348
def filter(self, predicate: ex.Expression) -> UnorderedIR:
    """Apply ``predicate`` as a row filter and return the resulting IR.

    If any of the predicate's unbound variables resolves to a window
    expression, this IR is first reprojected to a table and the filter is
    applied to that reprojected result instead.
    """
    references_window = any(
        is_window(self._get_ibis_column(variable))
        for variable in predicate.unbound_variables
    )
    if references_window:
        # ibis doesn't support qualify syntax, so a CTE is created before
        # filtering over a window expression.
        # https://github.com/ibis-project/ibis/issues/9775
        return self._reproject_to_table().filter(predicate)

    column_bindings = {
        column_id: self._get_ibis_column(column_id) for column_id in self.column_ids
    }
    compiled_condition = op_compiler.compile_expression(predicate, column_bindings)
    return self._filter(compiled_condition)
@@ -785,15 +780,33 @@ def promote_offsets(self, col_id: str) -> OrderedIR:
785
780
"""
786
781
# Special case: offsets already exist
787
782
ordering = self ._ordering
783
+ # Case 1, already have offsets, just create column from them
784
+ if ordering .is_sequential and (ordering .total_order_col is not None ):
785
+ expr_builder = self .builder ()
786
+ expr_builder .columns = [
787
+ self ._compile_expression (
788
+ ordering .total_order_col .scalar_expression
789
+ ).name (col_id ),
790
+ * self .columns ,
791
+ ]
792
+ return expr_builder .build ()
793
+ # Cannot nest analytic expressions, so reproject to cte first if needed.
794
+ # Also ibis cannot window literals, so need to reproject those (even though this is legal in googlesql)
795
+ # See: https://github.com/ibis-project/ibis/issues/9773
796
+ can_directly_window = not any (
797
+ map (lambda x : is_literal (x ) or is_window (x ), self ._ibis_order )
798
+ )
799
+ if not can_directly_window :
800
+ return self ._reproject_to_table ().promote_offsets (col_id )
788
801
789
- if (not ordering .is_sequential ) or (not ordering .total_order_col ):
790
- return self ._project_offsets ().promote_offsets (col_id )
802
+ window = ibis .window (order_by = self ._ibis_order )
803
+ if self ._predicates :
804
+ window = window .group_by (self ._reduced_predicate )
805
+ offsets = ibis .row_number ().over (window )
791
806
expr_builder = self .builder ()
792
807
expr_builder .columns = [
793
- self ._compile_expression (ordering .total_order_col .scalar_expression ).name (
794
- col_id
795
- ),
796
808
* self .columns ,
809
+ offsets .name (col_id ),
797
810
]
798
811
return expr_builder .build ()
799
812
@@ -806,7 +819,6 @@ def project_window_op(
806
819
output_name = None ,
807
820
* ,
808
821
never_skip_nulls = False ,
809
- skip_reproject_unsafe : bool = False ,
810
822
) -> OrderedIR :
811
823
"""
812
824
Creates a new expression based on this expression with unary operation applied to one column.
@@ -815,8 +827,25 @@ def project_window_op(
815
827
window_spec: a specification of the window over which to apply the operator
816
828
output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided
817
829
never_skip_nulls: will disable null skipping for operators that would otherwise do so
818
- skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection
819
830
"""
831
+ # Cannot nest analytic expressions, so reproject to cte first if needed.
832
+ # Also ibis cannot window literals, so need to reproject those (even though this is legal in googlesql)
833
+ # See: https://github.com/ibis-project/ibis/issues/9773
834
+ used_exprs = map (
835
+ self ._get_any_column , [column_name , * window_spec .all_referenced_columns ]
836
+ )
837
+ can_directly_window = not any (
838
+ map (lambda x : is_literal (x ) or is_window (x ), used_exprs )
839
+ )
840
+ if not can_directly_window :
841
+ return self ._reproject_to_table ().project_window_op (
842
+ column_name ,
843
+ op ,
844
+ window_spec ,
845
+ output_name ,
846
+ never_skip_nulls = never_skip_nulls ,
847
+ )
848
+
820
849
column = typing .cast (ibis_types .Column , self ._get_ibis_column (column_name ))
821
850
window = self ._ibis_window_from_spec (
822
851
window_spec , require_total_order = op .uses_total_row_ordering
@@ -861,8 +890,7 @@ def project_window_op(
861
890
window_op = case_statement
862
891
863
892
result = self ._set_or_replace_by_id (output_name or column_name , window_op )
864
- # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation.
865
- return result ._reproject_to_table () if not skip_reproject_unsafe else result
893
+ return result
866
894
867
895
def _reproject_to_table (self ) -> OrderedIR :
868
896
table = self ._to_ibis_expr (
@@ -944,7 +972,7 @@ def _to_ibis_expr(
944
972
expose_hidden_cols : bool = False ,
945
973
fraction : Optional [float ] = None ,
946
974
col_id_overrides : typing .Mapping [str , str ] = {},
947
- ordering_mode : Literal ["string_encoded" , "offset_col" , " unordered" ],
975
+ ordering_mode : Literal ["string_encoded" , "unordered" ],
948
976
order_col_name : Optional [str ] = ORDER_ID_COLUMN ,
949
977
):
950
978
"""
@@ -953,8 +981,7 @@ def _to_ibis_expr(
953
981
ArrayValue objects are sorted, so the following options are available
954
982
to reflect this in the ibis expression.
955
983
956
- * "offset_col": Zero-based offsets are generated as a column, this will
957
- not sort the rows however.
984
+
958
985
* "string_encoded": An ordered string column is provided in output table.
959
986
* "unordered": No ordering information will be provided in output. Only
960
987
value columns are projected.
@@ -981,10 +1008,9 @@ def _to_ibis_expr(
981
1008
"""
982
1009
assert ordering_mode in (
983
1010
"string_encoded" ,
984
- "offset_col" ,
985
1011
"unordered" ,
986
1012
)
987
- if expose_hidden_cols and ordering_mode in ("ordered_col" , "offset_col" ):
1013
+ if expose_hidden_cols and ordering_mode in ("ordered_col" ):
988
1014
raise ValueError (
989
1015
f"Cannot expose hidden ordering columns with ordering_mode { ordering_mode } "
990
1016
)
@@ -1034,6 +1060,10 @@ def _to_ibis_expr(
1034
1060
return table
1035
1061
1036
1062
def filter(self, predicate: ex.Expression) -> OrderedIR:
    """Apply ``predicate`` as a row filter and return the resulting ordered IR.

    If any of the predicate's unbound variables resolves to a window
    expression, this IR is first reprojected to a table and the filter is
    applied to that reprojected result instead.
    """
    references_window = any(
        is_window(self._get_ibis_column(variable))
        for variable in predicate.unbound_variables
    )
    if references_window:
        # ibis doesn't support qualify syntax, so a CTE is created before
        # filtering over a window expression.
        # https://github.com/ibis-project/ibis/issues/9775
        return self._reproject_to_table().filter(predicate)

    column_bindings = {
        column_id: self._get_ibis_column(column_id) for column_id in self.column_ids
    }
    compiled_condition = op_compiler.compile_expression(predicate, column_bindings)
    return self._filter(compiled_condition)
@@ -1174,37 +1204,14 @@ def _bake_ordering(self) -> OrderedIR:
1174
1204
predicates = self ._predicates ,
1175
1205
)
1176
1206
1177
- def _project_offsets (self ) -> OrderedIR :
1178
- """Create a new expression that contains offsets. Should only be executed when
1179
- offsets are needed for an operations. Has no effect on expression semantics."""
1180
- if self ._ordering .is_sequential :
1181
- return self
1182
- table = self ._to_ibis_expr (
1183
- ordering_mode = "offset_col" , order_col_name = ORDER_ID_COLUMN
1184
- )
1185
- columns = [table [column_name ] for column_name in self ._column_names ]
1186
- ordering = TotalOrdering (
1187
- ordering_value_columns = tuple ([ascending_over (ORDER_ID_COLUMN )]),
1188
- total_ordering_columns = frozenset ([ORDER_ID_COLUMN ]),
1189
- integer_encoding = IntegerEncoding (True , is_sequential = True ),
1190
- )
1191
- return OrderedIR (
1192
- table ,
1193
- columns = columns ,
1194
- hidden_ordering_columns = [table [ORDER_ID_COLUMN ]],
1195
- ordering = ordering ,
1196
- )
1197
-
1198
1207
def _create_order_columns (
1199
1208
self ,
1200
1209
ordering_mode : str ,
1201
1210
order_col_name : Optional [str ],
1202
1211
expose_hidden_cols : bool ,
1203
1212
) -> typing .Sequence [ibis_types .Value ]:
1204
1213
# Generate offsets if current ordering id semantics are not sufficiently strict
1205
- if ordering_mode == "offset_col" :
1206
- return (self ._create_offset_column ().name (order_col_name ),)
1207
- elif ordering_mode == "string_encoded" :
1214
+ if ordering_mode == "string_encoded" :
1208
1215
return (self ._create_string_ordering_column ().name (order_col_name ),)
1209
1216
elif expose_hidden_cols :
1210
1217
return self ._hidden_ordering_columns
@@ -1328,6 +1335,22 @@ def build(self) -> OrderedIR:
1328
1335
)
1329
1336
1330
1337
1338
def is_literal(column: ibis_types.Value) -> bool:
    """Report whether ``column`` is anything other than an ibis ``Column``.

    Literals in ibis are not ``Column`` objects and therefore can't be
    aggregated, so any value failing the ``Column`` instance check is
    treated as a literal here.
    """
    if isinstance(column, ibis_types.Column):
        return False
    return True
1341
+
1342
+
1343
def is_window(column: ibis_types.Value) -> bool:
    """Report whether ``column``'s expression involves a window function.

    ``find_topmost`` is asked for the nodes of the column's operation that
    are either a ``WindowFunction`` or a ``Relation``; the result is true
    when at least one of the returned nodes is a ``WindowFunction``.
    """

    def _is_window_or_relation(node) -> bool:
        return isinstance(node, (ibis_ops.WindowFunction, ibis_ops.Relation))

    topmost_nodes = column.op().find_topmost(_is_window_or_relation)
    for node in topmost_nodes:
        if isinstance(node, ibis_ops.WindowFunction):
            return True
    return False
1352
+
1353
+
1331
1354
def _reduce_predicate_list (
1332
1355
predicate_list : typing .Collection [ibis_types .BooleanValue ],
1333
1356
) -> ibis_types .BooleanValue :
0 commit comments