16
16
# under the License.
17
17
# pylint:disable=redefined-outer-name
18
18
19
+
20
+ import uuid
21
+ from pathlib import PosixPath
19
22
from typing import (
20
23
Dict ,
21
24
List ,
40
43
NoSuchTableError ,
41
44
TableAlreadyExistsError ,
42
45
)
43
- from pyiceberg .io import load_file_io
46
+ from pyiceberg .io import WAREHOUSE , load_file_io
44
47
from pyiceberg .partitioning import UNPARTITIONED_PARTITION_SPEC , PartitionField , PartitionSpec
45
48
from pyiceberg .schema import Schema
46
49
from pyiceberg .table import (
53
56
TableIdentifier ,
54
57
update_table_metadata ,
55
58
)
56
- from pyiceberg .table .metadata import TableMetadataV1
59
+ from pyiceberg .table .metadata import new_table_metadata
57
60
from pyiceberg .table .sorting import UNSORTED_SORT_ORDER , SortOrder
58
61
from pyiceberg .transforms import IdentityTransform
59
62
from pyiceberg .typedef import EMPTY_DICT , Identifier , Properties
60
63
from pyiceberg .types import IntegerType , LongType , NestedField
61
64
65
+ DEFAULT_WAREHOUSE_LOCATION = "file:///tmp/warehouse"
66
+
62
67
63
68
class InMemoryCatalog (Catalog ):
64
- """An in-memory catalog implementation for testing purposes."""
69
+ """
70
+ An in-memory catalog implementation that uses in-memory data-structures to store the namespaces and tables.
71
+
72
+ This is useful for test, demo, and playground but not in production as data is not persisted.
73
+ """
65
74
66
75
__tables : Dict [Identifier , Table ]
67
76
__namespaces : Dict [Identifier , Properties ]
@@ -70,6 +79,7 @@ def __init__(self, name: str, **properties: str) -> None:
70
79
super ().__init__ (name , ** properties )
71
80
self .__tables = {}
72
81
self .__namespaces = {}
82
+ self ._warehouse_location = properties .get (WAREHOUSE , DEFAULT_WAREHOUSE_LOCATION )
73
83
74
84
def create_table (
75
85
self ,
@@ -79,6 +89,7 @@ def create_table(
79
89
partition_spec : PartitionSpec = UNPARTITIONED_PARTITION_SPEC ,
80
90
sort_order : SortOrder = UNSORTED_SORT_ORDER ,
81
91
properties : Properties = EMPTY_DICT ,
92
+ table_uuid : Optional [uuid .UUID ] = None ,
82
93
) -> Table :
83
94
schema : Schema = self ._convert_schema_if_needed (schema ) # type: ignore
84
95
@@ -91,24 +102,26 @@ def create_table(
91
102
if namespace not in self .__namespaces :
92
103
self .__namespaces [namespace ] = {}
93
104
94
- new_location = location or f's3://warehouse/{ "/" .join (identifier )} /data'
95
- metadata = TableMetadataV1 (** {
96
- "format-version" : 1 ,
97
- "table-uuid" : "d20125c8-7284-442c-9aea-15fee620737c" ,
98
- "location" : new_location ,
99
- "last-updated-ms" : 1602638573874 ,
100
- "last-column-id" : schema .highest_field_id ,
101
- "schema" : schema .model_dump (),
102
- "partition-spec" : partition_spec .model_dump ()["fields" ],
103
- "properties" : properties ,
104
- "current-snapshot-id" : - 1 ,
105
- "snapshots" : [{"snapshot-id" : 1925 , "timestamp-ms" : 1602638573822 }],
106
- })
105
+ if not location :
106
+ location = f'{ self ._warehouse_location } /{ "/" .join (identifier )} '
107
+
108
+ metadata_location = self ._get_metadata_location (location = location )
109
+ metadata = new_table_metadata (
110
+ schema = schema ,
111
+ partition_spec = partition_spec ,
112
+ sort_order = sort_order ,
113
+ location = location ,
114
+ properties = properties ,
115
+ table_uuid = table_uuid ,
116
+ )
117
+ io = load_file_io ({** self .properties , ** properties }, location = location )
118
+ self ._write_metadata (metadata , io , metadata_location )
119
+
107
120
table = Table (
108
121
identifier = identifier ,
109
122
metadata = metadata ,
110
- metadata_location = f's3://warehouse/ { "/" . join ( identifier ) } /metadata/metadata.json' ,
111
- io = load_file_io () ,
123
+ metadata_location = metadata_location ,
124
+ io = io ,
112
125
catalog = self ,
113
126
)
114
127
self .__tables [identifier ] = table
@@ -118,14 +131,29 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location:
118
131
raise NotImplementedError
119
132
120
133
def _commit_table(self, table_request: CommitTableRequest) -> CommitTableResponse:
    """Apply the requested updates to a table and persist new metadata.

    Every requirement is validated against the metadata we are committing on
    top of before anything is changed.  When the updates produce no effective
    change, the existing metadata and metadata location are returned untouched;
    otherwise a new metadata file (previous version + 1) is written via the
    table's FileIO and the in-memory table object is switched over to it.
    """
    name_parts = tuple(table_request.identifier.namespace.root + [table_request.identifier.name])
    table = self.load_table(self.identifier_to_tuple_without_catalog(name_parts))
    current_metadata = table.metadata

    # All preconditions must hold against the current metadata before committing.
    for req in table_request.requirements:
        req.validate(current_metadata)

    next_metadata = update_table_metadata(current_metadata, table_request.updates)

    # No effective change: report the state we already have, write nothing.
    if next_metadata == current_metadata:
        return CommitTableResponse(metadata=current_metadata, metadata_location=table.metadata_location)

    # Persist the updated metadata under the next version number.
    next_version = self._parse_metadata_version(table.metadata_location) + 1
    next_metadata_location = self._get_metadata_location(current_metadata.location, next_version)
    self._write_metadata(next_metadata, table.io, next_metadata_location)

    # Flip the in-memory table over to the freshly committed metadata.
    table.metadata = next_metadata

    return CommitTableResponse(metadata=next_metadata, metadata_location=next_metadata_location)
129
157
130
158
def load_table (self , identifier : Union [str , Identifier ]) -> Table :
131
159
identifier = self .identifier_to_tuple_without_catalog (identifier )
@@ -160,7 +188,7 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U
160
188
identifier = to_identifier ,
161
189
metadata = table .metadata ,
162
190
metadata_location = table .metadata_location ,
163
- io = load_file_io ( ),
191
+ io = self . _load_file_io ( properties = table . metadata . properties , location = table . metadata_location ),
164
192
catalog = self ,
165
193
)
166
194
return self .__tables [to_identifier ]
@@ -232,8 +260,8 @@ def update_namespace_properties(
232
260
233
261
234
262
@pytest.fixture
def catalog(tmp_path: PosixPath) -> InMemoryCatalog:
    """In-memory catalog whose warehouse root points at a per-test temp directory."""
    props = {WAREHOUSE: tmp_path.absolute().as_posix(), "test.key": "test.value"}
    return InMemoryCatalog("test.in_memory.catalog", **props)
237
265
238
266
239
267
TEST_TABLE_IDENTIFIER = ("com" , "organization" , "department" , "my_table" )
@@ -244,7 +272,6 @@ def catalog() -> InMemoryCatalog:
244
272
NestedField (2 , "y" , LongType (), doc = "comment" ),
245
273
NestedField (3 , "z" , LongType ()),
246
274
)
247
- TEST_TABLE_LOCATION = "protocol://some/location"
248
275
TEST_TABLE_PARTITION_SPEC = PartitionSpec (PartitionField (name = "x" , transform = IdentityTransform (), source_id = 1 , field_id = 1000 ))
249
276
TEST_TABLE_PROPERTIES = {"key1" : "value1" , "key2" : "value2" }
250
277
NO_SUCH_TABLE_ERROR = "Table does not exist: \\ ('com', 'organization', 'department', 'my_table'\\ )"
@@ -261,7 +288,6 @@ def given_catalog_has_a_table(
261
288
return catalog .create_table (
262
289
identifier = TEST_TABLE_IDENTIFIER ,
263
290
schema = TEST_TABLE_SCHEMA ,
264
- location = TEST_TABLE_LOCATION ,
265
291
partition_spec = TEST_TABLE_PARTITION_SPEC ,
266
292
properties = properties or TEST_TABLE_PROPERTIES ,
267
293
)
@@ -307,13 +333,25 @@ def test_create_table(catalog: InMemoryCatalog) -> None:
307
333
table = catalog .create_table (
308
334
identifier = TEST_TABLE_IDENTIFIER ,
309
335
schema = TEST_TABLE_SCHEMA ,
310
- location = TEST_TABLE_LOCATION ,
311
336
partition_spec = TEST_TABLE_PARTITION_SPEC ,
312
337
properties = TEST_TABLE_PROPERTIES ,
313
338
)
314
339
assert catalog .load_table (TEST_TABLE_IDENTIFIER ) == table
315
340
316
341
342
def test_create_table_location_override(catalog: InMemoryCatalog) -> None:
    """create_table must honor an explicit location instead of the warehouse default."""
    custom_location = f"{catalog._warehouse_location}/new_location"
    created = catalog.create_table(
        identifier=TEST_TABLE_IDENTIFIER,
        schema=TEST_TABLE_SCHEMA,
        location=custom_location,
        partition_spec=TEST_TABLE_PARTITION_SPEC,
        properties=TEST_TABLE_PROPERTIES,
    )
    assert created == catalog.load_table(TEST_TABLE_IDENTIFIER)
    assert created.location() == custom_location
353
+
354
+
317
355
@pytest .mark .parametrize (
318
356
"schema,expected" ,
319
357
[
@@ -335,8 +373,6 @@ def test_create_table_pyarrow_schema(catalog: InMemoryCatalog, pyarrow_schema_si
335
373
table = catalog .create_table (
336
374
identifier = TEST_TABLE_IDENTIFIER ,
337
375
schema = pyarrow_schema_simple_without_ids ,
338
- location = TEST_TABLE_LOCATION ,
339
- partition_spec = TEST_TABLE_PARTITION_SPEC ,
340
376
properties = TEST_TABLE_PROPERTIES ,
341
377
)
342
378
assert catalog .load_table (TEST_TABLE_IDENTIFIER ) == table
@@ -662,7 +698,7 @@ def test_add_column_with_statement(catalog: InMemoryCatalog) -> None:
662
698
663
699
def test_catalog_repr(catalog: InMemoryCatalog) -> None:
    """repr() shows the catalog name followed by its concrete class."""
    assert repr(catalog) == "test.in_memory.catalog (<class 'test_base.InMemoryCatalog'>)"
666
702
667
703
668
704
def test_table_properties_int_value (catalog : InMemoryCatalog ) -> None :
0 commit comments