-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
[ArrayManager] Implement concat with axis=1 (merge/join) #39841
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
072ed39
dd14d2d
4da4433
31cfb0d
480b32d
3488d63
437c1d2
2597cd5
31d1305
b6a6b54
3f5fc38
54692ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,6 +49,45 @@ | |
from pandas import Index | ||
|
||
|
||
def concatenate_array_managers( | ||
mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool | ||
) -> Manager: | ||
""" | ||
Concatenate array managers into one. | ||
|
||
Parameters | ||
---------- | ||
mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples | ||
axes : list of Index | ||
concat_axis : int | ||
copy : bool | ||
|
||
Returns | ||
------- | ||
ArrayManager | ||
""" | ||
# reindex all arrays | ||
mgrs = [] | ||
for mgr, indexers in mgrs_indexers: | ||
for ax, indexer in indexers.items(): | ||
mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True) | ||
mgrs.append(mgr) | ||
|
||
if concat_axis == 1: | ||
# concatting along the rows -> concat the reindexed arrays | ||
# TODO(ArrayManager) doesn't yet preserve the correct dtype | ||
arrays = [ | ||
concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))]) | ||
for j in range(len(mgrs[0].arrays)) | ||
] | ||
return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False) | ||
else: | ||
# concatting along the columns -> combine reindexed arrays in a single manager | ||
assert concat_axis == 0 | ||
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) | ||
return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False) | ||
|
||
|
||
def concatenate_block_managers( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you at a higher level call concatenate_array_managers instead? (e.g. instead of dispatching thru the BM) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That would require adding if/else checking the manager type in each of the places where currently If you want I can create a new There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok maybe rename this then to concatenate_managers? to make it much more obvious. |
||
mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool | ||
) -> Manager: | ||
|
@@ -66,20 +105,9 @@ def concatenate_block_managers( | |
------- | ||
BlockManager | ||
""" | ||
# TODO(ArrayManager) this assumes that all managers are of the same type | ||
if isinstance(mgrs_indexers[0][0], ArrayManager): | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if concat_axis == 1: | ||
# TODO for now only fastpath without indexers | ||
mgrs = [t[0] for t in mgrs_indexers] | ||
arrays = [ | ||
concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0) | ||
for j in range(len(mgrs[0].arrays)) | ||
] | ||
return ArrayManager(arrays, [axes[1], axes[0]]) | ||
elif concat_axis == 0: | ||
mgrs = [t[0] for t in mgrs_indexers] | ||
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) | ||
return ArrayManager(arrays, [axes[1], axes[0]]) | ||
return concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) | ||
|
||
concat_plans = [ | ||
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -282,6 +282,18 @@ def get_dtypes(self): | |
dtypes = np.array([blk.dtype for blk in self.blocks]) | ||
return algos.take_nd(dtypes, self.blknos, allow_fill=False) | ||
|
||
@property | ||
def arrays(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. annotate |
||
""" | ||
Quick access to the backing arrays of the Blocks. | ||
|
||
Only for compatibility with ArrayManager for testing convenience. | ||
Not to be used in actual code, and return value is not the same as the | ||
ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs). | ||
""" | ||
for blk in self.blocks: | ||
yield blk.values | ||
|
||
def __getstate__(self): | ||
block_values = [b.values for b in self.blocks] | ||
block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -287,17 +287,27 @@ def test_merge_copy(self): | |
merged["d"] = "peekaboo" | ||
assert (right["d"] == "bar").all() | ||
|
||
def test_merge_nocopy(self): | ||
def test_merge_nocopy(self, using_array_manager): | ||
left = DataFrame({"a": 0, "b": 1}, index=range(10)) | ||
right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) | ||
|
||
merged = merge(left, right, left_index=True, right_index=True, copy=False) | ||
|
||
merged["a"] = 6 | ||
assert (left["a"] == 6).all() | ||
if using_array_manager: | ||
# With ArrayManager, setting a column doesn't change the values inplace | ||
# and thus does not propagate the changes to the original left/right | ||
# dataframes -> need to check that no copy was made in a different way | ||
# TODO(ArrayManager) we should be able to simplify this with a .loc | ||
# setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10 | ||
# but this currently replaces the array (_setitem_with_indexer_split_path) | ||
assert merged._mgr.arrays[0] is left._mgr.arrays[0] | ||
assert merged._mgr.arrays[2] is right._mgr.arrays[0] | ||
else: | ||
merged["a"] = 6 | ||
assert (left["a"] == 6).all() | ||
|
||
merged["d"] = "peekaboo" | ||
assert (right["d"] == "peekaboo").all() | ||
merged["d"] = "peekaboo" | ||
assert (right["d"] == "peekaboo").all() | ||
|
||
def test_intelligently_handle_join_key(self): | ||
# #733, be a bit more 1337 about not returning unconsolidated DataFrame | ||
|
@@ -1381,7 +1391,9 @@ def test_merge_readonly(self): | |
np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] | ||
) | ||
|
||
data1._mgr.blocks[0].values.flags.writeable = False | ||
for arr in data1._mgr.arrays: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. cabn you comment on this (I know what you are doing by seeing what you added, but is non-obvious here i think) |
||
arr.flags.writeable = False | ||
|
||
data1.merge(data2) # no error | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Only
reshape/merge
and notreshape/concat
, as there are too many tests still failing in the concat tests (because axis=0 is not yet handled properly)