|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +# This software was developed at the National Institute of Standards |
| 4 | +# and Technology by employees of the Federal Government in the course |
| 5 | +# of their official duties. Pursuant to title 17 Section 105 of the |
| 6 | +# United States Code this software is not subject to copyright |
| 7 | +# protection and is in the public domain. NIST assumes no |
| 8 | +# responsibility whatsoever for its use by other parties, and makes |
| 9 | +# no guarantees, expressed or implied, about its quality, |
| 10 | +# reliability, or any other characteristic. |
| 11 | +# |
| 12 | +# We would appreciate acknowledgement if the software is used. |
| 13 | + |
| 14 | +""" |
| 15 | +This library provides supporting constants and functions for generating deterministic UUIDs (version 5) for UCO Hash and Facet nodes. |
| 16 | +
|
| 17 | +There are two general patterns implemented: |
| 18 | +
|
| 19 | +1. Some objects are "wholly specified" by their properties. The leading example of this is uco-types:Hash, which has only the properties hashMethod and hashValue, and both are required to be provided in order to be conformant with UCO. The function `hash_method_value_uuid` implements a scheme to generate UUIDs for uco-types:Hash nodes based on this pattern. |
| 20 | +2. A pattern based on inherence generates UUIDv5s based on how an inherent object (a.k.a. UcoInherentCharacterizationThing) structurally relates to the object in which it inheres. For instance, a Facet is understood to only relate to its UcoObject by linking with the uco-core:hasFacet property. So, a Facet's UUID is determined uniquely by (1) the "UUID namespace" of its corresponding UcoObject, and (2) its OWL Class. |
| 21 | + A. The term "UUID namespace" is described in RFC 4122 Section 4.3 [#rfc4122s43]_ , and is not intended to be confused with `rdflib.term.Namespace`. For any uco-core:UcoThing (or even owl:Thing), the function `inherence_uuid` defines the procedure for either extracting or generating a UUID for use as a namespace. |
| 22 | +
|
| 23 | +This module is independent of, and complements, `case_utils.local_uuid`, which provides deterministic UUIDs based on the calling process's environment. |
| 24 | +
|
| 25 | +References |
| 26 | +========== |
| 27 | +
|
| 28 | +.. [#rfc4122s43] https://datatracker.ietf.org/doc/html/rfc4122#section-4.3 |
| 29 | +
|
| 30 | +
|
| 31 | +Examples |
| 32 | +======== |
| 33 | +
|
| 34 | +A knowledge base ontology currently uses a prefix 'kb:', expanding to 'http://example.org/kb/'. This knowledge base has a node kb:File-ac6b44cf-dc6b-4f2c-a09d-c9beb0a345a9. What is the IRI of its FileFacet? |
| 35 | +
|
| 36 | +>>> from case_utils.namespace import NS_UCO_OBSERVABLE |
| 37 | +>>> ns_kb = Namespace("http://example.org/kb/") |
| 38 | +>>> n_file = ns_kb["File-ac6b44cf-dc6b-4f2c-a09d-c9beb0a345a9"] |
| 39 | +>>> n_file_facet = get_facet_uriref(n_file, NS_UCO_OBSERVABLE.FileFacet, namespace=ns_kb) |
| 40 | +>>> n_file_facet |
| 41 | +rdflib.term.URIRef('http://example.org/kb/FileFacet-01d292e3-0f38-5974-868d-006ef07f5186') |
| 42 | +
|
| 43 | +A documentation policy change has been enacted, and now all knowledge base individuals need to use the URN example form. What is the FileFacet IRI now? |
| 44 | +
|
| 45 | +>>> ns_kb_2 = Namespace("urn:example:kb:") |
| 46 | +>>> file_iri_2: str = "urn:example:kb:File-ac6b44cf-dc6b-4f2c-a09d-c9beb0a345a9" |
| 47 | +>>> n_file_2 = URIRef(file_iri_2) |
| 48 | +>>> n_file_facet_2 = get_facet_uriref(n_file_2, NS_UCO_OBSERVABLE.FileFacet, namespace=ns_kb_2) |
| 49 | +>>> n_file_facet_2 |
| 50 | +rdflib.term.URIRef('urn:example:kb:FileFacet-01d292e3-0f38-5974-868d-006ef07f5186') |
| 51 | +
|
| 52 | +The two IRIs end with the same UUID. |
| 53 | +
|
| 54 | +>>> assert str(n_file_facet)[-36:] == str(n_file_facet_2)[-36:] |
| 55 | +""" |
| 56 | + |
# Version string of this module.
__version__ = "0.0.3"
| 58 | + |
| 59 | +import binascii |
| 60 | +import re |
| 61 | +import uuid |
| 62 | +from typing import Any, Dict, Optional, Tuple |
| 63 | + |
| 64 | +from rdflib import Literal, Namespace, URIRef |
| 65 | + |
| 66 | +from case_utils.namespace import NS_UCO_CORE, NS_UCO_VOCABULARY, NS_XSD |
| 67 | + |
# Literals for the hash method names handled by this module.  Each is typed
# with the NS_UCO_VOCABULARY.HashNameVocab datatype, so a plain (untyped)
# string Literal such as Literal("MD5") is a different key.
L_MD5 = Literal("MD5", datatype=NS_UCO_VOCABULARY.HashNameVocab)
L_SHA1 = Literal("SHA1", datatype=NS_UCO_VOCABULARY.HashNameVocab)
L_SHA256 = Literal("SHA256", datatype=NS_UCO_VOCABULARY.HashNameVocab)
L_SHA3_256 = Literal("SHA3-256", datatype=NS_UCO_VOCABULARY.HashNameVocab)
L_SHA3_512 = Literal("SHA3-512", datatype=NS_UCO_VOCABULARY.HashNameVocab)
L_SHA384 = Literal("SHA384", datatype=NS_UCO_VOCABULARY.HashNameVocab)
L_SHA512 = Literal("SHA512", datatype=NS_UCO_VOCABULARY.HashNameVocab)
L_SSDEEP = Literal("SSDEEP", datatype=NS_UCO_VOCABULARY.HashNameVocab)

# Key: hashMethod literal.
# Value: Tuple.
#   * [0] Lowercase spelling of the hash method name.  This is the spelling
#     that `hash_method_value_uuid` places into the hash URN.
#   * [1] An optional integer.  NOTE(review): presumably the expected length,
#     in hexadecimal characters, of a hash value for that method (e.g. 32 for
#     MD5), with None where no fixed length applies (SSDEEP) - confirm.  The
#     code visible in this module does not consult this element.
HASH_METHOD_CASTINGS: Dict[Literal, Tuple[str, Optional[int]]] = {
    L_MD5: ("md5", 32),
    L_SHA1: ("sha1", 40),
    L_SHA256: ("sha256", 64),
    L_SHA3_256: ("sha3-256", 64),
    L_SHA3_512: ("sha3-512", 128),
    L_SHA384: ("sha384", 96),
    L_SHA512: ("sha512", 128),
    L_SSDEEP: ("ssdeep", None),
}
| 90 | + |
# Matches a hyphenated UUID (case-insensitive, version digit 0 through 5) at
# the end of a string.
# NOTE: "$" also matches immediately before a single trailing newline.  Code
# that needs a strict "ends with a UUID" test must additionally check that the
# match ends at the end of the string, as `inherence_uuid` does.
RX_UUID = re.compile(
    "[0-9a-f]{8}-[0-9a-f]{4}-[0-5][0-9a-f]{3}-[0-9a-f]{4}-[0-9a-f]{12}$", re.IGNORECASE
)


def inherence_uuid(n_uco_thing: URIRef, *args: Any, **kwargs: Any) -> uuid.UUID:
    """
    This function returns a UUID for any UcoThing, that can be used as a UUID Namespace in further `uuid.uuid5` calls.

    In the case that the UcoThing's IRI is at least 40 characters long and ends with a UUID, that UUID string will be returned wrapped in a UUID object (so the returned UUID has whatever version the IRI carried).  In all other cases, a UUID version 5 object will be returned for the node as a name under the URL namespace [#rfc4122ac]_.

    :param n_uco_thing: The node whose IRI is inspected.  Only its string form is used.
    :param args: Accepted and ignored.
    :param kwargs: Accepted and ignored.
    :returns: The UUID embedded at the end of the IRI, or else a UUIDv5 of the whole IRI under `uuid.NAMESPACE_URL`.

    References
    ==========

    .. [#rfc4122ac] https://datatracker.ietf.org/doc/html/rfc4122#appendix-C

    Examples
    ========

    A File node will need its FileFacet IRI determined.  What will be the base UUID namespace for determining this IRI as well as other inherent graph objects?

    >>> file_iri: str = "http://example.org/kb/File-ac6b44cf-dc6b-4f2c-a09d-c9beb0a345a9"
    >>> n_file = URIRef(file_iri)
    >>> file_uuid_namespace: uuid.UUID = inherence_uuid(n_file)
    >>> file_uuid_namespace
    UUID('ac6b44cf-dc6b-4f2c-a09d-c9beb0a345a9')

    The CASE homepage is being treated as an OWL NamedIndividual in this knowledge base, with its URL as its IRI.  What is its base UUID namespace?

    >>> case_homepage_url: str = "https://caseontology.org/"
    >>> n_case_homepage = URIRef(case_homepage_url)
    >>> case_homepage_uuid_namespace = inherence_uuid(n_case_homepage)
    >>> case_homepage_uuid_namespace
    UUID('2c6406b7-3396-5fdd-b9bf-c6e21273e40a')
    """
    node_iri = str(n_uco_thing)

    # <40 -> Too short to have a UUID and scheme, so do not even try to match.
    uuid_match = RX_UUID.search(node_iri) if len(node_iri) >= 40 else None

    # The pattern's "$" also matches just before a trailing newline.  Slicing
    # the last 36 characters in that situation would drop the UUID's first
    # digit and keep the newline, which `uuid.UUID` silently accepts, yielding
    # a wrong UUID.  Require the match to reach the true end of the IRI, and
    # build the UUID from the matched text rather than from a fixed slice.
    if uuid_match is None or uuid_match.end() != len(node_iri):
        return uuid.uuid5(uuid.NAMESPACE_URL, node_iri)
    return uuid.UUID(uuid_match.group(0))
| 132 | + |
| 133 | + |
def facet_inherence_uuid(
    uco_object_inherence_uuid: uuid.UUID,
    n_facet_class: URIRef,
    *args: Any,
    **kwargs: Any
) -> uuid.UUID:
    """
    Derive the UUIDv5 of a Facet from the UUID namespace of the UcoObject it inheres in and the IRI of the Facet's class.

    :param uco_object_inherence_uuid: The UUID used as the UUIDv5 namespace.  Within this module, `get_facet_uriref` obtains it from `inherence_uuid`.
    :type uco_object_inherence_uuid: uuid.UUID
    :param n_facet_class: This node is expected to be the `rdflib.term.URIRef` for an OWL Class that is either in UCO or extends a class in UCO, such as `case_utils.namespace.NS_UCO_OBSERVABLE.FileFacet`.  The Facet class SHOULD be a 'leaf' class - that is, it should have no OWL subclasses.  (This 'SHOULD' might become a more stringent requirement in the future.  uco-core:Facet must not be used.  There is some question on how this rule should apply for uco-observable:WifiAddressFacet and its parent class uco-observable:MACAddressFacet.)
    :type n_facet_class: rdflib.term.URIRef
    :param args: Accepted and ignored.
    :param kwargs: Accepted and ignored.
    :returns: `uuid.uuid5` of the class IRI string under the given UUID namespace.
    :raises ValueError: If `n_facet_class` is uco-core:Facet itself.
    """

    requested_base_facet = n_facet_class == NS_UCO_CORE.Facet
    if requested_base_facet:
        raise ValueError("Requested Facet class is not a leaf Facet class.")
    # NOTE: No further check is made here that n_facet_class really names a Facet subclass.  Doing so would require both a set of all known Facet classes and an extension mechanism for non-standard Facet subclasses (probably either a Set or Graph as an extra parameter).

    facet_class_iri = str(n_facet_class)
    return uuid.uuid5(uco_object_inherence_uuid, facet_class_iri)
| 150 | + |
| 151 | + |
def get_facet_uriref(
    n_uco_object: URIRef,
    n_facet_class: URIRef,
    *args: Any,
    namespace: Namespace,
    **kwargs: Any
) -> URIRef:
    """
    Build the deterministic IRI for the Facet of class `n_facet_class` that inheres in `n_uco_object`.

    The IRI is the given namespace prefix, followed by the Facet class's local name, a hyphen, and the Facet's inherence UUID.

    :param n_uco_object: The node the Facet inheres in.  Its UUID namespace is determined with `inherence_uuid`.
    :type n_uco_object: rdflib.term.URIRef
    :param n_facet_class: The Facet's class, passed on to `facet_inherence_uuid`.
    :type n_facet_class: rdflib.term.URIRef
    :param args: Accepted and ignored.
    :param namespace: An RDFLib Namespace object to use for prefixing the Facet IRI with a knowledge base prefix IRI.
    :type namespace: rdflib.Namespace
    :param kwargs: Accepted and ignored.
    :raises ValueError: Propagated from `facet_inherence_uuid` when `n_facet_class` is uco-core:Facet.

    Examples
    ========

    What is the URLFacet pertaining to the Nitroba University Scenario's PCAP file, when being interpreted as a Simple Storage Service (S3) object?

    >>> from case_utils.namespace import NS_UCO_OBSERVABLE
    >>> pcap_url: str = "s3://digitalcorpora/corpora/scenarios/2008-nitroba/nitroba.pcap"
    >>> n_pcap = URIRef(pcap_url)
    >>> ns_kb = Namespace("http://example.org/kb/")
    >>> n_pcap_url_facet = get_facet_uriref(n_pcap, NS_UCO_OBSERVABLE.URLFacet, namespace=ns_kb)
    >>> n_pcap_url_facet
    rdflib.term.URIRef('http://example.org/kb/URLFacet-4b6023da-dbc4-5e1e-9a2f-aca2a6f6405c')
    """
    facet_uuid = facet_inherence_uuid(inherence_uuid(n_uco_object), n_facet_class)

    # NOTE: Taking the text after the last "/" encodes an assumption that Facets (including extension Facets) use the "Slash" IRI style.
    local_name = str(n_facet_class).rpartition("/")[2]

    return namespace[f"{local_name}-{facet_uuid}"]
| 183 | + |
| 184 | + |
def hash_method_value_uuid(l_hash_method: Literal, l_hash_value: Literal) -> uuid.UUID:
    """
    Generate the UUID for a UCO Hash object from nothing but its two required properties, uco-types:hashMethod and uco-types:hashValue.

    The name fed to UUIDv5 (under `uuid.NAMESPACE_URL`) is a URN of the form ``urn:hash::<method>:<value>``, following the scheme in this draft IETF memo:

    https://datatracker.ietf.org/doc/html/draft-thiemann-hash-urn-01

    Note that at the time of this writing, that memo was expired (expiration date 2004-03-04) and did not have a linked superseding document.

    :param l_hash_method: The hashMethod literal.  It must be one of the keys of `HASH_METHOD_CASTINGS`.
    :param l_hash_value: The hashValue literal.  Its datatype must be xsd:hexBinary.
    :returns: The UUIDv5 of the populated hash URN.
    :raises ValueError: If `l_hash_value` does not have the xsd:hexBinary datatype.
    :raises KeyError: If `l_hash_method` is not a key of `HASH_METHOD_CASTINGS`.
    """

    if l_hash_value.datatype != NS_XSD.hexBinary:
        raise ValueError("Expected hexBinary datatype for l_hash_value.")

    # Render the literal's Python value as lowercase hexadecimal text.
    raw_digest = l_hash_value.toPython()
    digest_hex: str = binascii.hexlify(raw_digest).decode().lower()

    # The first element of the casting tuple is the method's lowercase spelling.
    method_name = HASH_METHOD_CASTINGS[l_hash_method][0]

    hash_urn = "urn:hash::%s:%s" % (method_name, digest_hex)
    return uuid.uuid5(uuid.NAMESPACE_URL, hash_urn)
0 commit comments