Closed
Description
Describe the bug
use datafusion::prelude::{ParquetReadOptions, SessionContext};
/// Reproduction case: registers the parquet file as table `patient`, selects
/// the `meta` column with a `LIMIT 10`, and prints the result.
///
/// Every step is unwrapped, so any error surfaces as a test panic.
#[tokio::test]
async fn temp() {
    let parquet_path = "part-00000-f6337bce-7fcd-4021-9f9d-040413ea83f8-c000.snappy.parquet";
    let query = "SELECT patient.meta FROM patient LIMIT 10";

    let session = SessionContext::new();
    session
        .register_parquet("patient", parquet_path, ParquetReadOptions::default())
        .await
        .unwrap();

    let projected = session.sql(query).await.unwrap();
    projected.show().await.unwrap();
}
Here, part-00000-f6337bce-7fcd-4021-9f9d-040413ea83f8-c000.snappy.parquet is the parquet file provided by @kesavkolla in #2439.
This fails with the following error:
called `Result::unwrap()` on an `Err` value: ArrowError(ExternalError(ArrowError(InvalidArgumentError("column types must match schema types, expected Struct([Field { name: \"id\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"extension\", data_type: List(Field { name: \"element\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"versionId\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"lastUpdated\", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"source\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"profile\", data_type: List(Field { name: \"element\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"security\", data_type: List(Field { name: \"element\", data_type: Struct([Field { name: \"id\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"extension\", data_type: List(Field { name: \"element\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"system\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"version\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"code\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"display\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: 
\"userSelected\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"tag\", data_type: List(Field { name: \"element\", data_type: Struct([Field { name: \"id\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"extension\", data_type: List(Field { name: \"element\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"system\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"version\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"code\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"display\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"userSelected\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }]) but found Struct([Field { name: \"id\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }]) at column index 0"))))
The problem arises because ParquetExec passes projection indices that refer to the arrow schema to get_record_reader_by_columns, which instead expects parquet column indices. In the presence of nested types, these are not the same thing.
This is further complicated by apache/arrow-rs#1652 and apache/arrow-rs#1651
To Reproduce
Run the code above
Expected behavior
The code should run without error and print the requested rows of patient.meta.