Skip to content

Commit 6368a8b

Browse files
committed
#683 Add support for '_' for hierarchical key generation at leaf level.
1 parent 4872933 commit 6368a8b

File tree

6 files changed

+247
-5
lines changed

6 files changed

+247
-5
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1244,6 +1244,23 @@ val df = spark
12441244
.load("examples/multisegment_data/COMP.DETAILS.SEP30.DATA.dat")
12451245
```
12461246

1247+
Sometimes the leaf level contains many segment ids. In this case, you can use `_` as the segment id list to mean 'all remaining segment ids', like this:
1249+
1250+
```scala
1251+
val df = spark
1252+
.read
1253+
.format("cobol")
1254+
.option("copybook_contents", copybook)
1255+
.option("record_format", "V")
1256+
.option("segment_field", "SEGMENT_ID")
1257+
.option("segment_id_level0", "C")
1258+
.option("segment_id_level1", "_")
1259+
.load("examples/multisegment_data/COMP.DETAILS.SEP30.DATA.dat")
1260+
```
1261+
1262+
Both of the code snippets above produce the same result.
1263+
12471264
The resulting table will look like this:
12481265
```
12491266
df.show(10)

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/iterator/SegmentIdAccumulator.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@
1616

1717
package za.co.absa.cobrix.cobol.reader.iterator
1818

19+
import za.co.absa.cobrix.cobol.reader.parameters.ParameterParsingUtils
20+
1921
final class SegmentIdAccumulator (segmentIds: scala.collection.Seq[String], segmentIdPrefix: String, val fileId: Int) {
20-
private val segmentIdsArr = segmentIds.toArray.map(_.split(","))
22+
private val segmentIdsArr = ParameterParsingUtils.splitSegmentIds(segmentIds)
23+
2124
private val segmentIdCount = segmentIds.size
2225
private val segmentIdAccumulator = new Array[Long](segmentIdCount + 1)
2326
private var currentLevel = -1
@@ -77,7 +80,7 @@ final class SegmentIdAccumulator (segmentIds: scala.collection.Seq[String], segm
7780
var level: Option[Int] = None
7881
var i = 0
7982
while (level.isEmpty && i<segmentIdCount) {
80-
if (segmentIdsArr(i).contains(id))
83+
if (segmentIdsArr(i).contains(id) || segmentIdsArr(i).contains("_"))
8184
level = Some(i)
8285
i += 1
8386
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*
2+
* Copyright 2018 ABSA Group Limited
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package za.co.absa.cobrix.cobol.reader.parameters
18+
19+
object ParameterParsingUtils {
  /**
    * Splits segment ids defined in spark-cobol options for hierarchical id generation.
    *
    * Each element of the input corresponds to one hierarchy level and is a comma-separated
    * list of segment ids. Ids are trimmed, and '*' is normalized to '_' so that downstream
    * matching only needs to handle a single wildcard form.
    */
  def splitSegmentIds(segmentIdsToSplit: scala.collection.Seq[String]): Array[Array[String]] = {
    val idsPerLevel = segmentIdsToSplit.map { levelIds =>
      levelIds
        .split(',')
        .map { rawId =>
          val id = rawId.trim()
          if (id == "*") "_" else id
        }
    }
    idsPerLevel.toArray
  }

  /**
    * Validates segment ids for hierarchical record id generation.
    *
    * Throws IllegalArgumentException when a wildcard ('_' or '*') is used on a non-leaf
    * level, or when a wildcard is combined with explicit segment ids on the same level.
    */
  def validateSegmentIds(segmentIds: Array[Array[String]]): Unit = {
    val leafLevel = segmentIds.length - 1
    for ((ids, level) <- segmentIds.zipWithIndex) {
      // Wildcards only make sense at the deepest level ("the rest of the segment ids").
      if (ids.contains("_") && level < leafLevel)
        throw new IllegalArgumentException(s"The '_' as a segment id can only be used on the leaf level (segment_id_level$leafLevel), found at 'segment_id_level$level'")
      // NOTE(review): when this method is fed the output of splitSegmentIds, '*' has already
      // been normalized to '_', so this branch only fires for ids that skipped normalization.
      if (ids.contains("*") && level < leafLevel)
        throw new IllegalArgumentException(s"The '*' as a segment id can only be used on the leaf level (segment_id_level$leafLevel), found at 'segment_id_level$level'")
      // A wildcard already matches everything, so combining it with explicit ids is an error.
      if ((ids.contains("*") || ids.contains("_")) && ids.length > 1)
        throw new IllegalArgumentException(s"The '*' or '_' as a segment id cannot be used with other ids 'segment_id_level$level = ${ids.mkString(",")}' is incorrect.")
    }
  }
}
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*
2+
* Copyright 2018 ABSA Group Limited
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package za.co.absa.cobrix.cobol.reader.parameters
18+
19+
import org.scalatest.wordspec.AnyWordSpec
20+
21+
class ParameterParsingUtilsSuite extends AnyWordSpec {
  "splitSegmentIds()" should {
    "split input segment ids" in {
      val result = ParameterParsingUtils.splitSegmentIds(Seq("A,B,C", "D,E,F"))

      assert(result.length == 2)
      assert(result(0).sameElements(Array("A", "B", "C")))
      assert(result(1).sameElements(Array("D", "E", "F")))
    }

    "trim if split with spaces" in {
      val result = ParameterParsingUtils.splitSegmentIds(Seq("A, B, C", "D, E, F"))

      assert(result.length == 2)
      assert(result(0).sameElements(Array("A", "B", "C")))
      assert(result(1).sameElements(Array("D", "E", "F")))
    }

    "handle empty strings" in {
      val result = ParameterParsingUtils.splitSegmentIds(Seq("", ""))

      assert(result.length == 2)
      assert(result(0).head == "")
      assert(result(1).head == "")
    }
  }

  "validateSegmentIds()" should {
    "validate segment ids" in {
      // No wildcards anywhere, so validation is expected to pass silently.
      val ids = Array(
        Array("A", "B", "C"),
        Array("D", "E", "F")
      )

      ParameterParsingUtils.validateSegmentIds(ids)
    }

    "throw an exception if '_' is used on the wrong level" in {
      val ids = Array(
        Array("_"),
        Array("A", "B", "C")
      )

      val ex = intercept[IllegalArgumentException] {
        ParameterParsingUtils.validateSegmentIds(ids)
      }

      assert(ex.getMessage.contains("The '_' as a segment id can only be used on the leaf level (segment_id_level1), found at 'segment_id_level0'"))
    }

    "throw an exception if '*' is used on the wrong level" in {
      val ids = Array(
        Array("A"),
        Array("B"),
        Array("*"),
        Array("C")
      )

      val ex = intercept[IllegalArgumentException] {
        ParameterParsingUtils.validateSegmentIds(ids)
      }

      assert(ex.getMessage.contains("The '*' as a segment id can only be used on the leaf level (segment_id_level3), found at 'segment_id_level2'"))
    }

    "throw an exception if '*' or '_' is used with other ids" in {
      val ids = Array(
        Array("A", "B", "C"),
        Array("D", "*", "F", "G")
      )

      val ex = intercept[IllegalArgumentException] {
        ParameterParsingUtils.validateSegmentIds(ids)
      }

      assert(ex.getMessage.contains("'*' or '_' as a segment id cannot be used with other ids"))
    }
  }
}

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -551,15 +551,19 @@ object CobolParametersParser extends Logging {
551551
private def parseMultisegmentParameters(params: Parameters): Option[MultisegmentParameters] = {
552552
if (params.contains(PARAM_SEGMENT_FIELD)) {
553553
val levels = parseSegmentLevels(params)
554-
Some(MultisegmentParameters
555-
(
554+
val multiseg = MultisegmentParameters(
556555
params(PARAM_SEGMENT_FIELD),
557556
params.get(PARAM_SEGMENT_FILTER).map(_.split(',')),
558557
levels,
559558
params.getOrElse(PARAM_SEGMENT_ID_PREFIX, ""),
560559
getSegmentIdRedefineMapping(params),
561560
getSegmentRedefineParents(params)
562-
))
561+
)
562+
563+
val segmentIds = ParameterParsingUtils.splitSegmentIds(multiseg.segmentLevelIds)
564+
ParameterParsingUtils.validateSegmentIds(segmentIds)
565+
566+
Some(multiseg)
563567
}
564568
else {
565569
None

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test17HierarchicalSpec.scala

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,72 @@ class Test17HierarchicalSpec extends AnyWordSpec with SparkTestBase with CobolTe
113113

114114
testData(actualDf, actualResultsPath, expectedResultsPath)
115115
}
116+
117+
"return a dataframe with ids generated when _ notation is used" in {
  // Leaf level (level2) uses '_' to match every segment id not claimed by levels 0 and 1.
  val reader = spark
    .read
    .format("cobol")
    .option("copybook", inputCopybookPath)
    .option("pedantic", "true")
    .option("record_format", "V")
    .option("generate_record_id", "true")
    .option("schema_retention_policy", "collapse_root")
    .option("segment_field", "SEGMENT_ID")
    .option("segment_id_level0", "1")
    .option("segment_id_level1", "2,5")
    .option("segment_id_level2", "_")
    .option("segment_id_prefix", "A")
    .option("redefine_segment_id_map:1", "COMPANY => 1")
    .option("redefine-segment-id-map:2", "DEPT => 2")
    .option("redefine-segment-id-map:3", "EMPLOYEE => 3")
    .option("redefine-segment-id-map:4", "OFFICE => 4")
    .option("redefine-segment-id-map:5", "CUSTOMER => 5")
    .option("redefine-segment-id-map:6", "CONTACT => 6")
    .option("redefine-segment-id-map:7", "CONTRACT => 7")

  val dataFrame = reader.load(inputDataPath)

  testSchema(dataFrame, actualSchemaPath, expectedSchemaPath)

  val actualData = dataFrame
    .orderBy("File_Id", "Record_Id")
    .toJSON
    .take(300)

  testData(actualData, actualResultsPath, expectedResultsPath)
}
149+
150+
"return a dataframe with ids generated when * notation is used" in {
  // Same scenario as the '_' test: '*' is the alternative wildcard spelling for the leaf level.
  val reader = spark
    .read
    .format("cobol")
    .option("copybook", inputCopybookPath)
    .option("pedantic", "true")
    .option("record_format", "V")
    .option("generate_record_id", "true")
    .option("schema_retention_policy", "collapse_root")
    .option("segment_field", "SEGMENT_ID")
    .option("segment_id_level0", "1")
    .option("segment_id_level1", "2,5")
    .option("segment_id_level2", "*")
    .option("segment_id_prefix", "A")
    .option("redefine_segment_id_map:1", "COMPANY => 1")
    .option("redefine-segment-id-map:2", "DEPT => 2")
    .option("redefine-segment-id-map:3", "EMPLOYEE => 3")
    .option("redefine-segment-id-map:4", "OFFICE => 4")
    .option("redefine-segment-id-map:5", "CUSTOMER => 5")
    .option("redefine-segment-id-map:6", "CONTACT => 6")
    .option("redefine-segment-id-map:7", "CONTRACT => 7")

  val dataFrame = reader.load(inputDataPath)

  testSchema(dataFrame, actualSchemaPath, expectedSchemaPath)

  val actualData = dataFrame
    .orderBy("File_Id", "Record_Id")
    .toJSON
    .take(300)

  testData(actualData, actualResultsPath, expectedResultsPath)
}
116182
}
117183

118184
"read as a hierarchical file with parent child relationships defined" should {

0 commit comments

Comments
 (0)