Skip to content

Incorrect Parquet Projection For Nested Types #2453

Closed
@tustvold

Description

@tustvold

Describe the bug

use datafusion::prelude::{ParquetReadOptions, SessionContext};

/// Reproduction case: registers the parquet file as table `patient`, then
/// selects only the `meta` column (a struct, per the error below) and prints
/// up to 10 rows. Every step unwraps, so the first failure panics the test.
#[tokio::test]
async fn temp() {
    let session = SessionContext::new();
    let parquet_path = "part-00000-f6337bce-7fcd-4021-9f9d-040413ea83f8-c000.snappy.parquet";

    session
        .register_parquet("patient", parquet_path, ParquetReadOptions::default())
        .await
        .unwrap();

    // Build the query and print its result in a single chain.
    session
        .sql("SELECT patient.meta FROM patient LIMIT 10")
        .await
        .unwrap()
        .show()
        .await
        .unwrap();
}

Where part-00000-f6337bce-7fcd-4021-9f9d-040413ea83f8-c000.snappy.parquet is the parquet file provided by @kesavkolla in #2439

This fails with

called `Result::unwrap()` on an `Err` value: ArrowError(ExternalError(ArrowError(InvalidArgumentError("column types must match schema types, expected Struct([Field { name: \"id\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"extension\", data_type: List(Field { name: \"element\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"versionId\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"lastUpdated\", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"source\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"profile\", data_type: List(Field { name: \"element\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"security\", data_type: List(Field { name: \"element\", data_type: Struct([Field { name: \"id\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"extension\", data_type: List(Field { name: \"element\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"system\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"version\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"code\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"display\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: 
\"userSelected\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"tag\", data_type: List(Field { name: \"element\", data_type: Struct([Field { name: \"id\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"extension\", data_type: List(Field { name: \"element\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"system\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"version\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"code\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"display\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, Field { name: \"userSelected\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }]) but found Struct([Field { name: \"id\", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }]) at column index 0"))))

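The problem arises because ParquetExec passes the projection indices of the arrow schema to get_record_reader_by_columns, which instead expects parquet column indices. In the presence of nested types, these are not the same thing: a single nested arrow field such as meta spans several parquet columns, so arrow index 0 selects only the first of them (here meta.id), which is why the reader returns a struct containing just id.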

This is further complicated by apache/arrow-rs#1652 and apache/arrow-rs#1651

To Reproduce

Run the code above

Expected behavior

The code should not error: the query should succeed and print up to 10 rows of the full patient.meta struct.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions