Skip to content

Commit 9bbe41d

Browse files
[DLP] Implemented dlp_deidentify_table_condition_masking (#9521)
* Rebase - 1 * [dlp_deidentify_table_condition_masking] Formatted * [dlp_deidentify_table_condition_masking] Formatted * [dlp_deidentify_table_condition_masking] Formatted * Refactored * Rebase - 1 * [dlp_deidentify_table_condition_masking] Formatted * [dlp_deidentify_table_condition_masking] Formatted * [dlp_deidentify_table_condition_masking] Formatted * Refactored --------- Co-authored-by: Karl Weinmeister <[email protected]>
1 parent 4ab86a3 commit 9bbe41d

File tree

2 files changed

+221
-4
lines changed

2 files changed

+221
-4
lines changed

dlp/snippets/deid.py

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,6 +1277,139 @@ def deidentify_table_condition_replace_with_info_types(
12771277
# [END dlp_deidentify_table_condition_infotypes]
12781278

12791279

1280+
# [START dlp_deidentify_table_condition_masking]
1281+
def deidentify_table_condition_masking(
    project,
    table_data,
    deid_content_list,
    condition_field=None,
    condition_operator=None,
    condition_value=None,
    masking_character=None
):
    """Mask selected table fields in the rows that satisfy a condition.

    Uses the Data Loss Prevention API to de-identify sensitive data in a
    table by masking the listed fields with a character, but only in rows
    for which the given condition holds.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_data: The table to de-identify, either a dict with "header"
            and "rows" keys or a JSON string encoding such a dict (the
            command-line entry point passes the raw string).
        deid_content_list: The fields in the table to de-identify. Either
            a list of field names, a JSON string encoding such a list, or
            a single field name given as a plain string.
        condition_field: A table Field within the record this condition is
            evaluated against.
        condition_operator: Operator used to compare the field or infoType
            to the value. One of: RELATIONAL_OPERATOR_UNSPECIFIED,
            EQUAL_TO, NOT_EQUAL_TO, GREATER_THAN, LESS_THAN,
            GREATER_THAN_OR_EQUALS, LESS_THAN_OR_EQUALS, EXISTS.
        condition_value: Integer value to compare against; a numeric
            string is converted with int(). [Mandatory, except for
            ``EXISTS`` tests.]
        masking_character: The character to mask matching sensitive data
            with.

    Returns:
        The de-identified table (``response.item.table``); it is also
        printed to the terminal.

    Raises:
        ValueError: If ``table_data`` is a string that is not valid JSON,
            or ``condition_value`` is a string that is not an integer.

    Example:
        table_data = {
            "header": ["email", "phone number", "age", "happiness_score"],
            "rows": [
                ["user1@example.org", "4232342345", "35", "21"],
                ["user2@example.org", "4253458383", "64", "34"]
            ]
        }

        >> $ python deid.py deid_table_condition_mask my-project
            '{"header": ["email", "phone number", "age", "happiness_score"],
            "rows": [["user1@example.org", "4232342345", "35", "21"],
            ["user2@example.org", "4253458383", "64", "34"]]}'
            '["happiness_score"]' --condition_field age
            --condition_operator GREATER_THAN --condition_value 50
        >> '{"header": ["email", "phone number", "age", "happiness_score"],
            "rows": [["user1@example.org", "4232342345", "35", "21"],
            ["user2@example.org", "4253458383", "64", "**"]]}'
    """

    # Import the client library
    import json

    import google.cloud.dlp

    # The command-line entry point forwards every argument as a raw string,
    # so normalize string inputs here; dict/list/int callers are unaffected.
    if isinstance(table_data, str):
        table_data = json.loads(table_data)

    if isinstance(deid_content_list, str):
        try:
            parsed_fields = json.loads(deid_content_list)
        except ValueError:
            parsed_fields = None
        # A string that is not a JSON list is taken as one field name;
        # iterating it directly would yield single characters.
        if isinstance(parsed_fields, list):
            deid_content_list = parsed_fields
        else:
            deid_content_list = [deid_content_list]

    # The condition is sent as `integer_value`, so a numeric string from
    # the command line has to become an int first.
    if isinstance(condition_value, str):
        condition_value = int(condition_value)

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    headers = [{"name": val} for val in table_data["header"]]
    rows = [
        {"values": [{"string_value": cell_val} for cell_val in row]}
        for row in table_data["rows"]
    ]

    table = {"headers": headers, "rows": rows}

    # Construct the `item`
    item = {"table": table}

    # Specify fields to be de-identified
    deid_fields = [{"name": field_name} for field_name in deid_content_list]

    # Construct condition list
    condition = [
        {
            "field": {"name": condition_field},
            "operator": condition_operator,
            "value": {"integer_value": condition_value},
        }
    ]

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "record_transformations": {
            "field_transformations": [
                {
                    "primitive_transformation": {
                        "character_mask_config": {
                            "masking_character": masking_character
                        }
                    },
                    "fields": deid_fields,
                    "condition": {
                        "expressions": {
                            "conditions": {"conditions": condition}
                        }
                    },
                }
            ]
        }
    }

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Call the API.
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "item": item,
        }
    )

    # Print the result
    print(f"Table after de-identification: {response.item.table}")

    # Return the response
    return response.item.table
1409+
1410+
# [END dlp_deidentify_table_condition_masking]
1411+
1412+
12801413
if __name__ == "__main__":
12811414
parser = argparse.ArgumentParser(description=__doc__)
12821415
subparsers = parser.add_subparsers(
@@ -1607,6 +1740,45 @@ def deidentify_table_condition_replace_with_info_types(
16071740
help="Value to compare against. [Mandatory, except for ``EXISTS`` tests.].",
16081741
)
16091742

1743+
# Sub-command that masks the listed table fields in rows matching a
# condition; it is dispatched to deidentify_table_condition_masking().
table_condition_mask_parser = subparsers.add_parser(
    "deid_table_condition_mask",
    # Adjacent literals are concatenated as-is, so the trailing space is
    # required to keep "masking" and "them" apart in the rendered help.
    help="De-identify sensitive data in a table by masking "
    "them based on a condition.",
)
table_condition_mask_parser.add_argument(
    "project",
    help="The Google Cloud project id to use as a parent resource.",
)
table_condition_mask_parser.add_argument(
    "table_data",
    help="Json string representing table data",
)
table_condition_mask_parser.add_argument(
    "deid_content_list",
    help="A list of fields in table to de-identify.",
)
table_condition_mask_parser.add_argument(
    "--condition_field",
    help="A table Field within the record this condition is evaluated "
    "against.",
)
table_condition_mask_parser.add_argument(
    "--condition_operator",
    help="Operator used to compare the field or infoType to the value. "
    "One of: RELATIONAL_OPERATOR_UNSPECIFIED, EQUAL_TO, NOT_EQUAL_TO, "
    "GREATER_THAN, LESS_THAN, GREATER_THAN_OR_EQUALS, LESS_THAN_OR_EQUALS, "
    "EXISTS.",
)
table_condition_mask_parser.add_argument(
    "--condition_value",
    help="Value to compare against. [Mandatory, except for ``EXISTS`` tests.].",
)
table_condition_mask_parser.add_argument(
    "-m",
    "--masking_character",
    help="The character to mask matching sensitive data with.",
)
1781+
16101782
args = parser.parse_args()
16111783

16121784
if args.content == "deid_mask":
@@ -1687,3 +1859,13 @@ def deidentify_table_condition_replace_with_info_types(
16871859
condition_operator=args.condition_operator,
16881860
condition_value=args.condition_value
16891861
)
1862+
elif args.content == "deid_table_condition_mask":
1863+
deidentify_table_condition_masking(
1864+
args.project,
1865+
args.table_data,
1866+
args.deid_content_list,
1867+
condition_field=args.condition_field,
1868+
condition_operator=args.condition_operator,
1869+
condition_value=args.condition_value,
1870+
masking_character=args.masking_character
1871+
)

dlp/snippets/deid_test.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,14 @@
3939
DATE_SHIFTED_AMOUNT = 30
4040
DATE_FIELDS = ["birth_date", "register_date"]
4141
CSV_CONTEXT_FIELD = "name"
42-
TABLE_DATA = {"header": ["age", "patient", "happiness_score"],
43-
"rows": [["101", "Charles Dickens", "95"],
44-
["22", "Jane Austen", "21"],
45-
["90", "Mark Twain", "75"]]}
42+
TABLE_DATA = {
43+
"header": ["age", "patient", "happiness_score"],
44+
"rows": [
45+
["101", "Charles Dickens", "95"],
46+
["22", "Jane Austen", "21"],
47+
["90", "Mark Twain", "75"]
48+
]
49+
}
4650

4751

4852
@pytest.fixture(scope="module")
@@ -356,3 +360,34 @@ def test_deidentify_table_condition_replace_with_info_types(capsys):
356360
assert "[PERSON_NAME] name was a curse invented by [PERSON_NAME]." in out
357361
assert "There are 14 kisses in Jane Austen\\\'s novels." in out
358362
assert "[PERSON_NAME] loved cats." in out
363+
364+
365+
def test_deidentify_table_condition_masking(capsys):
    """Mask happiness_score where age > 89, with no masking character given.

    In TABLE_DATA the rows with ages 101 and 90 meet the condition, so
    their two-digit scores are expected to print as "**"; the row with
    age 22 must keep its original score of "21".
    """
    deid.deidentify_table_condition_masking(
        GCLOUD_PROJECT,
        TABLE_DATA,
        ["happiness_score"],
        condition_field="age",
        condition_operator="GREATER_THAN",
        condition_value=89,
    )

    # The function prints the resulting table, so inspect captured stdout.
    captured = capsys.readouterr()
    assert 'string_value: "**"' in captured.out
    assert 'string_value: "21"' in captured.out
378+
379+
380+
def test_deidentify_table_condition_masking_with_masking_character_specified(capsys):
    """Mask happiness_score where age > 89, using "#" as the mask.

    Rows meeting the condition are expected to print their two-digit
    score as "##"; the row with age 22 must keep its score of "21".
    """
    deid.deidentify_table_condition_masking(
        GCLOUD_PROJECT,
        TABLE_DATA,
        ["happiness_score"],
        condition_field="age",
        condition_operator="GREATER_THAN",
        condition_value=89,
        masking_character="#",
    )

    # The function prints the resulting table, so inspect captured stdout.
    captured = capsys.readouterr()
    assert 'string_value: "##"' in captured.out
    assert 'string_value: "21"' in captured.out

0 commit comments

Comments
 (0)