Skip to content

Commit 3000f94

Browse files
cloud-fan authored and MaxGekk committed
[SPARK-50767][SQL] Remove codegen of from_json
### What changes were proposed in this pull request?

This reopens #49411 to fix the performance regression in 4.0.

### Why are the changes needed?

It's non-trivial to support CSE for Filter in whole stage codegen. We should not rush but revert the codegen support in 4.0 so that we have more time to get it right in 4.1.

Note: 4.0 also adds codegen support for a few other expressions, but `from_json` is special as it's quite expensive and the performance regression is very significant with it.

### Does this PR introduce _any_ user-facing change?

no

### How was this patch tested?

N/A

### Was this patch authored or co-authored using generative AI tooling?

no

Closes #49992 from cloud-fan/json.

Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
(cherry picked from commit a8b694f)
Signed-off-by: Max Gekk <[email protected]>
1 parent a6ad0d9 commit 3000f94

File tree

3 files changed

+23
-39
lines changed

3 files changed

+23
-39
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala

Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
2020
import org.apache.spark.sql.catalyst.InternalRow
2121
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
2222
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
23-
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
23+
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, CodegenFallback, ExprCode}
2424
import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper
2525
import org.apache.spark.sql.catalyst.expressions.json.{GetJsonObjectEvaluator, JsonExpressionUtils, JsonToStructsEvaluator, JsonTupleEvaluator, SchemaOfJsonEvaluator, StructsToJsonEvaluator}
2626
import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke}
@@ -257,6 +257,7 @@ case class JsonToStructs(
257257
variantAllowDuplicateKeys: Boolean = SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS))
258258
extends UnaryExpression
259259
with TimeZoneAwareExpression
260+
with CodegenFallback
260261
with ExpectsInputTypes
261262
with QueryErrorsBase {
262263

@@ -304,31 +305,14 @@ case class JsonToStructs(
304305
copy(timeZoneId = Option(timeZoneId))
305306

306307
@transient
307-
private val nameOfCorruptRecord = SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
308+
private lazy val nameOfCorruptRecord = SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
308309

309310
@transient
310311
private lazy val evaluator = new JsonToStructsEvaluator(
311312
options, nullableSchema, nameOfCorruptRecord, timeZoneId, variantAllowDuplicateKeys)
312313

313314
override def nullSafeEval(json: Any): Any = evaluator.evaluate(json.asInstanceOf[UTF8String])
314315

315-
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
316-
val refEvaluator = ctx.addReferenceObj("evaluator", evaluator)
317-
val eval = child.genCode(ctx)
318-
val resultType = CodeGenerator.boxedType(dataType)
319-
val resultTerm = ctx.freshName("result")
320-
ev.copy(code =
321-
code"""
322-
|${eval.code}
323-
|$resultType $resultTerm = ($resultType) $refEvaluator.evaluate(${eval.value});
324-
|boolean ${ev.isNull} = $resultTerm == null;
325-
|${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)};
326-
|if (!${ev.isNull}) {
327-
| ${ev.value} = $resultTerm;
328-
|}
329-
|""".stripMargin)
330-
}
331-
332316
override def inputTypes: Seq[AbstractDataType] =
333317
StringTypeWithCollation(supportsTrimCollation = true) :: Nil
334318

sql/core/benchmarks/SubExprEliminationBenchmark-jdk21-results.txt

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,23 @@ Benchmark for performance of subexpression elimination
33
================================================================================================
44

55
Preparing data for benchmarking ...
6-
OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
6+
OpenJDK 64-Bit Server VM 21.0.6+7-LTS on Linux 6.8.0-1021-azure
77
AMD EPYC 7763 64-Core Processor
88
from_json as subExpr in Project: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
99
------------------------------------------------------------------------------------------------------------------------
10-
subExprElimination false, codegen: true 6313 6431 120 0.0 63134831.3 1.0X
11-
subExprElimination false, codegen: false 6093 6348 288 0.0 60930747.6 1.0X
12-
subExprElimination true, codegen: true 1387 1425 33 0.0 13872525.5 4.6X
13-
subExprElimination true, codegen: false 1218 1332 99 0.0 12182992.7 5.2X
10+
subExprElimination false, codegen: true 6700 7047 301 0.0 67001649.1 1.0X
11+
subExprElimination false, codegen: false 6719 6837 118 0.0 67191470.6 1.0X
12+
subExprElimination true, codegen: true 1350 1489 122 0.0 13503842.8 5.0X
13+
subExprElimination true, codegen: false 1366 1444 96 0.0 13658823.9 4.9X
1414

1515
Preparing data for benchmarking ...
16-
OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
16+
OpenJDK 64-Bit Server VM 21.0.6+7-LTS on Linux 6.8.0-1021-azure
1717
AMD EPYC 7763 64-Core Processor
1818
from_json as subExpr in Filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
1919
------------------------------------------------------------------------------------------------------------------------
20-
subExprElimination false, codegen: true 6610 6705 85 0.0 66104698.4 1.0X
21-
subExprElimination false, codegen: false 6647 6730 76 0.0 66469463.5 1.0X
22-
subExprElimination true, codegen: true 2077 2126 43 0.0 20769220.1 3.2X
23-
subExprElimination true, codegen: false 1949 2000 64 0.0 19489004.0 3.4X
20+
subExprElimination false, codegen: true 7250 7520 384 0.0 72501549.6 1.0X
21+
subExprElimination false, codegen: false 7255 7366 114 0.0 72554716.3 1.0X
22+
subExprElimination true, codegen: true 1934 2024 79 0.0 19344228.2 3.7X
23+
subExprElimination true, codegen: false 1981 2015 29 0.0 19814306.4 3.7X
2424

2525

sql/core/benchmarks/SubExprEliminationBenchmark-results.txt

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,23 @@ Benchmark for performance of subexpression elimination
33
================================================================================================
44

55
Preparing data for benchmarking ...
6-
OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
6+
OpenJDK 64-Bit Server VM 17.0.14+7-LTS on Linux 6.8.0-1021-azure
77
AMD EPYC 7763 64-Core Processor
88
from_json as subExpr in Project: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
99
------------------------------------------------------------------------------------------------------------------------
10-
subExprElimination false, codegen: true 6438 6551 98 0.0 64378783.5 1.0X
11-
subExprElimination false, codegen: false 6216 6320 175 0.0 62161826.1 1.0X
12-
subExprElimination true, codegen: true 1480 1518 39 0.0 14799890.8 4.3X
13-
subExprElimination true, codegen: false 1321 1429 94 0.0 13212919.6 4.9X
10+
subExprElimination false, codegen: true 6389 6498 173 0.0 63887225.6 1.0X
11+
subExprElimination false, codegen: false 6235 6292 81 0.0 62351284.2 1.0X
12+
subExprElimination true, codegen: true 1328 1368 47 0.0 13284825.9 4.8X
13+
subExprElimination true, codegen: false 1323 1368 73 0.0 13227629.0 4.8X
1414

1515
Preparing data for benchmarking ...
16-
OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
16+
OpenJDK 64-Bit Server VM 17.0.14+7-LTS on Linux 6.8.0-1021-azure
1717
AMD EPYC 7763 64-Core Processor
1818
from_json as subExpr in Filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
1919
------------------------------------------------------------------------------------------------------------------------
20-
subExprElimination false, codegen: true 7107 7310 207 0.0 71066752.8 1.0X
21-
subExprElimination false, codegen: false 6738 6781 41 0.0 67375897.0 1.1X
22-
subExprElimination true, codegen: true 2052 2110 51 0.0 20519152.3 3.5X
23-
subExprElimination true, codegen: false 2053 2079 33 0.0 20526629.8 3.5X
20+
subExprElimination false, codegen: true 7081 7177 86 0.0 70813603.9 1.0X
21+
subExprElimination false, codegen: false 6586 6720 139 0.0 65859888.8 1.1X
22+
subExprElimination true, codegen: true 1729 1827 117 0.0 17291697.7 4.1X
23+
subExprElimination true, codegen: false 1726 1789 57 0.0 17255779.7 4.1X
2424

2525

0 commit comments

Comments
 (0)