
Commit 161c596

Revert "[SPARK-38531][SQL] Fix the condition of "Prune unrequired child index" branch of ColumnPruning"
This reverts commit 4b93435.
1 parent b37defe commit 161c596

3 files changed: 8 additions & 58 deletions
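
For context, the "Prune unrequired child index" branch of ColumnPruning rewrites a Project on top of a Generate: attributes that only the generator needs are recorded in Generate.unrequiredChildIndex instead of being projected through, and attributes nobody needs are pruned from the child. A minimal sketch of that plan shape, runnable in spark-shell (the data and column names below are illustrative, not taken from this commit):

// Hypothetical demo data: `items` feeds the generator only, `extra` is never referenced.
import org.apache.spark.sql.functions.{col, explode}
import spark.implicits._   // `spark` is the SparkSession provided by spark-shell

val df = Seq((1, Seq("a", "b"), "unused")).toDF("id", "items", "extra")

val result = df
  .select(col("id"), explode(col("items")).as("item"))
  .select("item")

// The optimized plan should show `extra` pruned below the Generate and `items`
// tracked via unrequiredChildIndex rather than carried through the output.
result.explain(true)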

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala

Lines changed: 0 additions & 19 deletions
@@ -314,25 +314,6 @@ object NestedColumnAliasing {
   }
 }
 
-object GeneratorUnrequiredChildrenPruning {
-  def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match {
-    case p @ Project(_, g: Generate) =>
-      val requiredAttrs = p.references ++ g.generator.references
-      val newChild = ColumnPruning.prunedChild(g.child, requiredAttrs)
-      val unrequired = g.generator.references -- p.references
-      val unrequiredIndices = newChild.output.zipWithIndex.filter(t => unrequired.contains(t._1))
-        .map(_._2)
-      if (!newChild.fastEquals(g.child) ||
-        unrequiredIndices.toSet != g.unrequiredChildIndex.toSet) {
-        Some(p.copy(child = g.copy(child = newChild, unrequiredChildIndex = unrequiredIndices)))
-      } else {
-        None
-      }
-    case _ => None
-  }
-}
-
-
 /**
  * This prunes unnecessary nested columns from [[Generate]], or [[Project]] -> [[Generate]]
  */
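
The GeneratorUnrequiredChildrenPruning object deleted above is a plain Scala extractor: its unapply returns Some(rewrittenPlan) only when the rewrite would actually change the plan, so the matching case in ColumnPruning fires exactly when there is work to do and the optimizer batch does not loop on a no-op. A minimal, self-contained sketch of that extractor pattern, with hypothetical names:

// Hypothetical example: an extractor whose unapply encodes both the match condition
// and the rewrite, returning None when no change is needed.
object HalveIfEven {
  def unapply(n: Int): Option[Int] = if (n % 2 == 0) Some(n / 2) else None
}

def step(n: Int): Int = n match {
  case HalveIfEven(half) => half   // fires only when unapply returns Some(...)
  case other             => other  // unchanged otherwise
}

The `!newChild.fastEquals(g.child) || unrequiredIndices.toSet != g.unrequiredChildIndex.toSet` guard plays the `n % 2 == 0` role above: it is what makes the extractor return None once the plan is already pruned.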

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 8 additions & 7 deletions
@@ -862,12 +862,13 @@ object ColumnPruning extends Rule[LogicalPlan] {
       e.copy(child = prunedChild(child, e.references))
 
     // prune unrequired references
-    // There are 2 types of pruning here:
-    // 1. For attributes in g.child.outputSet that is not used by the generator nor the project,
-    //    we directly remove it from the output list of g.child.
-    // 2. For attributes that is not used by the project but it is used by the generator, we put
-    //    it in g.unrequiredChildIndex to save memory usage.
-    case GeneratorUnrequiredChildrenPruning(rewrittenPlan) => rewrittenPlan
+    case p @ Project(_, g: Generate) if p.references != g.outputSet =>
+      val requiredAttrs = p.references -- g.producedAttributes ++ g.generator.references
+      val newChild = prunedChild(g.child, requiredAttrs)
+      val unrequired = g.generator.references -- p.references
+      val unrequiredIndices = newChild.output.zipWithIndex.filter(t => unrequired.contains(t._1))
+        .map(_._2)
+      p.copy(child = g.copy(child = newChild, unrequiredChildIndex = unrequiredIndices))
 
     // prune unrequired nested fields from `Generate`.
     case GeneratorNestedColumnAliasing(rewrittenPlan) => rewrittenPlan
@@ -928,7 +929,7 @@ object ColumnPruning extends Rule[LogicalPlan] {
     })
 
   /** Applies a projection only when the child is producing unnecessary attributes */
-  def prunedChild(c: LogicalPlan, allReferences: AttributeSet): LogicalPlan =
+  private def prunedChild(c: LogicalPlan, allReferences: AttributeSet) =
     if (!c.outputSet.subsetOf(allReferences)) {
       Project(c.output.filter(allReferences.contains), c)
     } else {
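
prunedChild, which this revert makes private again, wraps the child in a Project only when the child produces attributes outside allReferences. A short sketch of that behavior against a toy relation, assuming the Catalyst test DSL (the relation and the choice of referenced attributes are illustrative):

// Illustrative only: mirrors the prunedChild logic shown in the hunk above.
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.AttributeSet
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}

val child = LocalRelation('a.int, 'b.int, 'c.int)
val allReferences = AttributeSet(child.output.take(2))   // only 'a and 'b are needed

val pruned: LogicalPlan =
  if (!child.outputSet.subsetOf(allReferences)) {
    Project(child.output.filter(allReferences.contains), child)   // keeps 'a, 'b; drops 'c
  } else {
    child   // already minimal, no extra Project
  }

With all three attributes referenced, the else branch would return the relation unchanged, which is why the rule never stacks redundant Projects.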

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala

Lines changed: 0 additions & 32 deletions
@@ -24,7 +24,6 @@ import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.optimizer.NestedColumnAliasingSuite.collectGeneratedAliases
 import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
@@ -465,37 +464,6 @@ class ColumnPruningSuite extends PlanTest {
     comparePlans(Optimize.execute(plan1.analyze), correctAnswer1)
   }
 
-  test("SPARK-38531: Nested field pruning for Project and PosExplode") {
-    val name = StructType.fromDDL("first string, middle string, last string")
-    val employer = StructType.fromDDL("id int, company struct<name:string, address:string>")
-    val contact = LocalRelation(
-      'id.int,
-      'name.struct(name),
-      'address.string,
-      'friends.array(name),
-      'relatives.map(StringType, name),
-      'employer.struct(employer))
-
-    val query = contact
-      .select('id, 'friends)
-      .generate(PosExplode('friends))
-      .select('col.getField("middle"))
-      .analyze
-    val optimized = Optimize.execute(query)
-
-    val aliases = collectGeneratedAliases(optimized)
-
-    val expected = contact
-      // GetStructField is pushed down, unused id column is pruned.
-      .select(
-        'friends.getField("middle").as(aliases(0)))
-      .generate(PosExplode($"${aliases(0)}"),
-        unrequiredChildIndex = Seq(0)) // unrequiredChildIndex is added.
-      .select('col.as("col.middle"))
-      .analyze
-    comparePlans(optimized, expected)
-  }
-
   test("SPARK-39445: Remove the window if windowExpressions is empty in column pruning") {
     object CustomOptimize extends RuleExecutor[LogicalPlan] {
       val batches = Batch("Column pruning", FixedPoint(10),
