Skip to content

Commit ccf32f9

Browse files
authored
Merge pull request #112 from casework/add_inherence_uuid_namespaces
Add inherence UUID functions
2 parents f6c72ab + 712ea3a commit ccf32f9

File tree

9 files changed

+440
-188
lines changed

9 files changed

+440
-188
lines changed

case_utils/case_file/__init__.py

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
import rdflib
2929

30-
import case_utils
30+
import case_utils.inherent_uuid
3131
from case_utils.namespace import (
3232
NS_RDF,
3333
NS_UCO_CORE,
@@ -60,6 +60,9 @@ def create_file_node(
6060
node_prefix: str = DEFAULT_PREFIX,
6161
disable_hashes: bool = False,
6262
disable_mtime: bool = False,
63+
*args: typing.Any,
64+
use_deterministic_uuids: bool = False,
65+
**kwargs: typing.Any,
6366
) -> rdflib.URIRef:
6467
r"""
6568
This function characterizes the file at filepath.
@@ -70,7 +73,7 @@ def create_file_node(
7073
:param filepath: The path to the file to characterize. Can be relative or absolute.
7174
:type filepath: str
7275
73-
:param node_iri: The desired full IRI for the node. If absent, will make an IRI of the pattern ``ns_base + 'File-' + uuid4``
76+
:param node_iri: The desired full IRI for the node. If absent, will make an IRI of the pattern ``ns_base + 'File-' + uuid``
7477
:type node_iri: str
7578
7679
:param node_prefix: The base prefix to use if node_iri is not supplied.
@@ -97,7 +100,15 @@ def create_file_node(
97100
literal_basename = rdflib.Literal(basename)
98101

99102
file_stat = os.stat(filepath)
100-
n_file_facet = node_namespace["FileFacet-" + case_utils.local_uuid.local_uuid()]
103+
104+
n_file_facet: rdflib.URIRef
105+
if use_deterministic_uuids:
106+
n_file_facet = case_utils.inherent_uuid.get_facet_uriref(
107+
n_file, NS_UCO_OBSERVABLE.FileFacet, namespace=node_namespace
108+
)
109+
else:
110+
n_file_facet = node_namespace["FileFacet-" + case_utils.local_uuid.local_uuid()]
111+
101112
graph.add(
102113
(
103114
n_file_facet,
@@ -124,9 +135,16 @@ def create_file_node(
124135
graph.add((n_file_facet, NS_UCO_OBSERVABLE.modifiedTime, literal_mtime))
125136

126137
if not disable_hashes:
127-
n_contentdata_facet = node_namespace[
128-
"content-data-facet-" + case_utils.local_uuid.local_uuid()
129-
]
138+
n_contentdata_facet: rdflib.URIRef
139+
if use_deterministic_uuids:
140+
n_contentdata_facet = case_utils.inherent_uuid.get_facet_uriref(
141+
n_file, NS_UCO_OBSERVABLE.ContentDataFacet, namespace=node_namespace
142+
)
143+
else:
144+
n_contentdata_facet = node_namespace[
145+
"ContentDataFacet-" + case_utils.local_uuid.local_uuid()
146+
]
147+
130148
graph.add((n_file, NS_UCO_CORE.hasFacet, n_contentdata_facet))
131149
graph.add(
132150
(n_contentdata_facet, NS_RDF.type, NS_UCO_OBSERVABLE.ContentDataFacet)
@@ -204,9 +222,8 @@ def create_file_node(
204222
for key in successful_hashdict._fields:
205223
if key not in ("md5", "sha1", "sha256", "sha512", "sha3_256", "sha3_512"):
206224
continue
207-
n_hash = node_namespace["hash-" + case_utils.local_uuid.local_uuid()]
208-
graph.add((n_contentdata_facet, NS_UCO_OBSERVABLE.hash, n_hash))
209-
graph.add((n_hash, NS_RDF.type, NS_UCO_TYPES.Hash))
225+
226+
l_hash_method: rdflib.Literal
210227
if key in ("sha3_256", "sha3_512"):
211228
l_hash_method = rdflib.Literal(
212229
key.replace("_", "-").upper(),
@@ -216,19 +233,35 @@ def create_file_node(
216233
l_hash_method = rdflib.Literal(
217234
key.upper(), datatype=NS_UCO_VOCABULARY.HashNameVocab
218235
)
236+
237+
hash_value: str = getattr(successful_hashdict, key)
238+
l_hash_value = rdflib.Literal(hash_value.upper(), datatype=NS_XSD.hexBinary)
239+
240+
hash_uuid: str
241+
if use_deterministic_uuids:
242+
hash_uuid = str(
243+
case_utils.inherent_uuid.hash_method_value_uuid(
244+
l_hash_method, l_hash_value
245+
)
246+
)
247+
else:
248+
hash_uuid = case_utils.local_uuid.local_uuid()
249+
n_hash = node_namespace["Hash-" + hash_uuid]
250+
251+
graph.add((n_contentdata_facet, NS_UCO_OBSERVABLE.hash, n_hash))
252+
graph.add((n_hash, NS_RDF.type, NS_UCO_TYPES.Hash))
219253
graph.add(
220254
(
221255
n_hash,
222256
NS_UCO_TYPES.hashMethod,
223257
l_hash_method,
224258
)
225259
)
226-
hash_value = getattr(successful_hashdict, key)
227260
graph.add(
228261
(
229262
n_hash,
230263
NS_UCO_TYPES.hashValue,
231-
rdflib.Literal(hash_value.upper(), datatype=NS_XSD.hexBinary),
264+
l_hash_value,
232265
)
233266
)
234267

@@ -241,6 +274,11 @@ def main() -> None:
241274
parser.add_argument("--debug", action="store_true")
242275
parser.add_argument("--disable-hashes", action="store_true")
243276
parser.add_argument("--disable-mtime", action="store_true")
277+
parser.add_argument(
278+
"--use-deterministic-uuids",
279+
action="store_true",
280+
help="Use UUIDs computed using the case_utils.inherent_uuid module.",
281+
)
244282
parser.add_argument(
245283
"--output-format", help="Override extension-based format guesser."
246284
)
@@ -281,6 +319,7 @@ def main() -> None:
281319
node_prefix=args.base_prefix,
282320
disable_hashes=args.disable_hashes,
283321
disable_mtime=args.disable_mtime,
322+
use_deterministic_uuids=args.use_deterministic_uuids,
284323
)
285324

286325
graph.serialize(args.out_graph, **serialize_kwargs)

case_utils/inherent_uuid.py

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
#!/usr/bin/env python3
2+
3+
# This software was developed at the National Institute of Standards
4+
# and Technology by employees of the Federal Government in the course
5+
# of their official duties. Pursuant to title 17 Section 105 of the
6+
# United States Code this software is not subject to copyright
7+
# protection and is in the public domain. NIST assumes no
8+
# responsibility whatsoever for its use by other parties, and makes
9+
# no guarantees, expressed or implied, about its quality,
10+
# reliability, or any other characteristic.
11+
#
12+
# We would appreciate acknowledgement if the software is used.
13+
14+
"""
15+
This library provides supporting constants and functions for generating deterministic UUIDs (version 5) for UCO Hash and Facet nodes.
16+
17+
There are two general patterns implemented:
18+
19+
1. Some objects are "wholly specified" by their properties. The leading example of this is uco-types:Hash, which has only the properties hashMethod and hashValue, and both are required to be provided in order to be conformant with UCO. The function `hash_method_value_uuid` implements a scheme to generate UUIDs for uco-types:Hash nodes based on this pattern.
20+
2. A pattern based on inherence generates UUIDv5s based on how an inherent object (a.k.a. UcoInherentCharacterizationThing) structurally relates to the object in which it inheres. For instance, a Facet is understood to only relate to its UcoObject by linking with the uco-core:hasFacet property. So, a Facet's UUID is determined uniquely by (1) the "UUID namespace" of its corresponding UcoObject, and (2) its OWL Class.
21+
A. The term "UUID namespace" is described in RFC 4122 Section 4.3 [#rfc4122s43]_ , and is not intended to be confused with `rdflib.term.Namespace`. For any uco-core:UcoThing (or even owl:Thing), the function `inherence_uuid` defines the procedure for either extracting or generating a UUID for use as a namespace.
22+
23+
This module is independent of, and complements, `case_utils.local_uuid`, which provides deterministic UUIDs based on the calling process's environment.
24+
25+
References
26+
==========
27+
28+
.. [#rfc4122s43] https://datatracker.ietf.org/doc/html/rfc4122#section-4.3
29+
30+
31+
Examples
32+
========
33+
34+
A knowledge base ontology currently uses a prefix 'kb:', expanding to 'http://example.org/kb/'. This knowledge base has a node kb:File-ac6b44cf-dc6b-4f2c-a09d-c9beb0a345a9. What is the IRI of its FileFacet?
35+
36+
>>> from case_utils.namespace import NS_UCO_OBSERVABLE
37+
>>> ns_kb = Namespace("http://example.org/kb/")
38+
>>> n_file = ns_kb["File-ac6b44cf-dc6b-4f2c-a09d-c9beb0a345a9"]
39+
>>> n_file_facet = get_facet_uriref(n_file, NS_UCO_OBSERVABLE.FileFacet, namespace=ns_kb)
40+
>>> n_file_facet
41+
rdflib.term.URIRef('http://example.org/kb/FileFacet-01d292e3-0f38-5974-868d-006ef07f5186')
42+
43+
A documentation policy change has been enacted, and now all knowledge base individuals need to use the URN example form. What is the FileFacet IRI now?
44+
45+
>>> ns_kb_2 = Namespace("urn:example:kb:")
46+
>>> file_iri_2: str = "urn:example:kb:File-ac6b44cf-dc6b-4f2c-a09d-c9beb0a345a9"
47+
>>> n_file_2 = URIRef(file_iri_2)
48+
>>> n_file_facet_2 = get_facet_uriref(n_file_2, NS_UCO_OBSERVABLE.FileFacet, namespace=ns_kb_2)
49+
>>> n_file_facet_2
50+
rdflib.term.URIRef('urn:example:kb:FileFacet-01d292e3-0f38-5974-868d-006ef07f5186')
51+
52+
The two IRIs end with the same UUID.
53+
54+
>>> assert str(n_file_facet)[-36:] == str(n_file_facet_2)[-36:]
55+
"""
56+
57+
__version__ = "0.0.3"
58+
59+
import binascii
60+
import re
61+
import uuid
62+
from typing import Any, Dict, Optional, Tuple
63+
64+
from rdflib import Literal, Namespace, URIRef
65+
66+
from case_utils.namespace import NS_UCO_CORE, NS_UCO_VOCABULARY, NS_XSD
67+
68+
L_MD5 = Literal("MD5", datatype=NS_UCO_VOCABULARY.HashNameVocab)
69+
L_SHA1 = Literal("SHA1", datatype=NS_UCO_VOCABULARY.HashNameVocab)
70+
L_SHA256 = Literal("SHA256", datatype=NS_UCO_VOCABULARY.HashNameVocab)
71+
L_SHA3_256 = Literal("SHA3-256", datatype=NS_UCO_VOCABULARY.HashNameVocab)
72+
L_SHA3_512 = Literal("SHA3-512", datatype=NS_UCO_VOCABULARY.HashNameVocab)
73+
L_SHA384 = Literal("SHA384", datatype=NS_UCO_VOCABULARY.HashNameVocab)
74+
L_SHA512 = Literal("SHA512", datatype=NS_UCO_VOCABULARY.HashNameVocab)
75+
L_SSDEEP = Literal("SSDEEP", datatype=NS_UCO_VOCABULARY.HashNameVocab)
76+
77+
# Key: hashMethod literal.
78+
# Value: Tuple.
79+
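# * Lowercase spelling of the hash method name.
# * Expected length of the hash value in hexadecimal digits, or None when the method has no fixed length (e.g. SSDEEP).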
80+
HASH_METHOD_CASTINGS: Dict[Literal, Tuple[str, Optional[int]]] = {
81+
L_MD5: ("md5", 32),
82+
L_SHA1: ("sha1", 40),
83+
L_SHA256: ("sha256", 64),
84+
L_SHA3_256: ("sha3-256", 64),
85+
L_SHA3_512: ("sha3-512", 128),
86+
L_SHA384: ("sha384", 96),
87+
L_SHA512: ("sha512", 128),
88+
L_SSDEEP: ("ssdeep", None),
89+
}
90+
91+
RX_UUID = re.compile(
92+
"[0-9a-f]{8}-[0-9a-f]{4}-[0-5][0-9a-f]{3}-[0-9a-f]{4}-[0-9a-f]{12}$", re.IGNORECASE
93+
)
94+
95+
96+
def inherence_uuid(n_uco_thing: URIRef, *args: Any, **kwargs: Any) -> uuid.UUID:
    """
    This function returns a UUID for any UcoThing, that can be used as a UUID Namespace in further `uuid.uuid5` calls.

    In the case that the UcoThing's IRI ends with a UUID, that UUID string will be returned wrapped in a UUID object.  In all other cases, a UUID version 5 object will be returned for the node as a name under the URL namespace [#rfc4122ac]_.

    :param n_uco_thing: The node whose IRI is inspected for a trailing UUID.
    :type n_uco_thing: rdflib.term.URIRef

    :returns: The UUID found at the end of the node's IRI, or otherwise a UUIDv5 of the whole IRI under ``uuid.NAMESPACE_URL``.
    :rtype: uuid.UUID

    The extra positional and keyword arguments are accepted but not used.

    References
    ==========

    .. [#rfc4122ac] https://datatracker.ietf.org/doc/html/rfc4122#appendix-C

    Examples
    ========

    A File node will need its FileFacet IRI determined.  What will be the base UUID namespace for determining this IRI as well as other inherent graph objects?

    >>> file_iri: str = "http://example.org/kb/File-ac6b44cf-dc6b-4f2c-a09d-c9beb0a345a9"
    >>> n_file = URIRef(file_iri)
    >>> file_uuid_namespace: uuid.UUID = inherence_uuid(n_file)
    >>> file_uuid_namespace
    UUID('ac6b44cf-dc6b-4f2c-a09d-c9beb0a345a9')

    The CASE homepage is being treated as an OWL NamedIndividual in this knowledge base, with its URL as its IRI.  What is its base UUID namespace?

    >>> case_homepage_url: str = "https://caseontology.org/"
    >>> n_case_homepage = URIRef(case_homepage_url)
    >>> case_homepage_uuid_namespace = inherence_uuid(n_case_homepage)
    >>> case_homepage_uuid_namespace
    UUID('2c6406b7-3396-5fdd-b9bf-c6e21273e40a')
    """
    node_iri = str(n_uco_thing)

    # <40 -> Too short to have a UUID and scheme, so the regular expression is not consulted.
    uuid_match = RX_UUID.search(node_iri) if len(node_iri) >= 40 else None

    # RX_UUID is anchored with "$", which also matches just before a
    # trailing newline.  Requiring the match to end at the true end of the
    # string keeps the UUID extraction aligned with the matched text; a
    # blind [-36:] slice would otherwise pick up a shifted, wrong UUID.
    if uuid_match is None or uuid_match.end() != len(node_iri):
        return uuid.uuid5(uuid.NAMESPACE_URL, node_iri)

    return uuid.UUID(uuid_match.group(0))
132+
133+
134+
def facet_inherence_uuid(
    uco_object_inherence_uuid: uuid.UUID,
    n_facet_class: URIRef,
    *args: Any,
    **kwargs: Any
) -> uuid.UUID:
    """
    Derive the UUIDv5 of a Facet from the UUID namespace of the UcoObject it inheres in and the IRI of the Facet's class.

    :param uco_object_inherence_uuid: The UUID used as the UUIDv5 namespace, e.g. as returned by `inherence_uuid` for the UcoObject.
    :type uco_object_inherence_uuid: uuid.UUID

    :param n_facet_class: This node is expected to be the `rdflib.term.URIRef` for an OWL Class that is either in UCO or extends a class in UCO, such as `case_utils.namespace.NS_UCO_OBSERVABLE.FileFacet`.  The Facet class SHOULD be a 'leaf' class - that is, it should have no OWL subclasses.  (This 'SHOULD' might become a more stringent requirement in the future.  uco-core:Facet must not be used.  There is some question on how this rule should apply for uco-observable:WifiAddressFacet and its parent class uco-observable:MACAddressFacet.)
    :type n_facet_class: rdflib.term.URIRef

    :returns: The UUIDv5 computed with the object's UUID as namespace and the Facet class IRI string as name.
    :rtype: uuid.UUID

    :raises ValueError: If n_facet_class is uco-core:Facet itself.

    The extra positional and keyword arguments are accepted but not used.
    """
    # Guard: the generic base class can never identify one specific Facet.
    if n_facet_class == NS_UCO_CORE.Facet:
        raise ValueError("Requested Facet class is not a leaf Facet class.")

    # NOTE: No further check is made here that n_facet_class really is a Facet subclass.  Doing so would need both a set of all known Facet classes and an extension mechanism for non-standard Facet subclasses (probably either a Set or Graph as an extra parameter).
    facet_class_iri = str(n_facet_class)
    facet_uuid = uuid.uuid5(uco_object_inherence_uuid, facet_class_iri)
    return facet_uuid
150+
151+
152+
def get_facet_uriref(
    n_uco_object: URIRef,
    n_facet_class: URIRef,
    *args: Any,
    namespace: Namespace,
    **kwargs: Any
) -> URIRef:
    """
    Build the deterministic IRI of the Facet of class n_facet_class that inheres in n_uco_object.

    The IRI has the form ``namespace + <Facet class local name> + '-' + <UUID>``, where the UUID comes from `inherence_uuid` and `facet_inherence_uuid`.

    :param n_uco_object: The node the Facet is attached to.
    :type n_uco_object: rdflib.term.URIRef

    :param n_facet_class: The OWL Class of the Facet.  See `facet_inherence_uuid` for the restrictions on this class.
    :type n_facet_class: rdflib.term.URIRef

    :param namespace: An RDFLib Namespace object to use for prefixing the Facet IRI with a knowledge base prefix IRI.
    :type namespace: rdflib.Namespace

    :returns: The IRI for the Facet node.
    :rtype: rdflib.term.URIRef

    The extra positional and keyword arguments are accepted but not used.

    Examples
    ========

    What is the URLFacet pertaining to the Nitroba University Scenario's PCAP file, when being interpreted as a Simple Storage Service (S3) object?

    >>> from case_utils.namespace import NS_UCO_OBSERVABLE
    >>> pcap_url: str = "s3://digitalcorpora/corpora/scenarios/2008-nitroba/nitroba.pcap"
    >>> n_pcap = URIRef(pcap_url)
    >>> ns_kb = Namespace("http://example.org/kb/")
    >>> n_pcap_url_facet = get_facet_uriref(n_pcap, NS_UCO_OBSERVABLE.URLFacet, namespace=ns_kb)
    >>> n_pcap_url_facet
    rdflib.term.URIRef('http://example.org/kb/URLFacet-4b6023da-dbc4-5e1e-9a2f-aca2a6f6405c')
    """
    # NOTE: This encodes an assumption that Facets (including extension Facets) use the "Slash" IRI style: the local name is everything after the last "/".
    local_name = str(n_facet_class).rpartition("/")[2]

    facet_uuid = facet_inherence_uuid(inherence_uuid(n_uco_object), n_facet_class)

    return namespace[local_name + "-" + str(facet_uuid)]
183+
184+
185+
def hash_method_value_uuid(l_hash_method: Literal, l_hash_value: Literal) -> uuid.UUID:
    """
    This function generates a UUID for a UCO Hash object, solely based on its two required properties: uco-types:hashMethod and uco-types:hashValue.

    The UUIDv5 seed data for Hash nodes is a URN following the scheme in this draft IETF memo:

    https://datatracker.ietf.org/doc/html/draft-thiemann-hash-urn-01

    Note that at the time of this writing, that memo was expired (expiration date 2004-03-04) and did not have a linked superseding document.

    :param l_hash_method: The hashMethod literal.  It must be one of the keys of `HASH_METHOD_CASTINGS`.
    :type l_hash_method: rdflib.Literal

    :param l_hash_value: The hashValue literal.  Its datatype must be xsd:hexBinary.
    :type l_hash_value: rdflib.Literal

    :returns: The UUIDv5 of the populated hash URN under ``uuid.NAMESPACE_URL``.
    :rtype: uuid.UUID

    :raises ValueError: If l_hash_value does not have the xsd:hexBinary datatype.
    :raises KeyError: If l_hash_method is not a key of `HASH_METHOD_CASTINGS`.
    """
    if l_hash_value.datatype != NS_XSD.hexBinary:
        raise ValueError("Expected hexBinary datatype for l_hash_value.")

    # Normalize the value to lowercase hexadecimal text so that the same digest always seeds the same UUID, however the literal was spelled.
    value_python = l_hash_value.toPython()
    value_hex: str = binascii.hexlify(value_python).decode().lower()

    # First tuple member is the lowercase spelling of the method name.
    method_casting = HASH_METHOD_CASTINGS[l_hash_method]
    method_name = method_casting[0]

    hash_urn = "urn:hash::%s:%s" % (method_name, value_hex)
    return uuid.uuid5(uuid.NAMESPACE_URL, hash_urn)

0 commit comments

Comments
 (0)