Skip to content

Commit 435d67e

Browse files
authored
Merge pull request #113 from casework/release-0.11.0
Release 0.11.0
2 parents f47bd6c + 6da50db commit 435d67e

15 files changed

+582
-138
lines changed

case_utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,6 @@
1111
#
1212
# We would appreciate acknowledgement if the software is used.
1313

14-
__version__ = "0.10.0"
14+
__version__ = "0.11.0"
1515

1616
from . import local_uuid # noqa: F401

case_utils/case_file/__init__.py

Lines changed: 70 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
This module creates a graph object that provides a basic UCO characterization of a single file. The gathered metadata is among the more "durable" file characteristics, i.e. characteristics that would remain consistent when transferring a file between locations.
1616
"""
1717

18-
__version__ = "0.4.0"
18+
__version__ = "0.5.0"
1919

2020
import argparse
2121
import datetime
@@ -27,7 +27,7 @@
2727

2828
import rdflib
2929

30-
import case_utils
30+
import case_utils.inherent_uuid
3131
from case_utils.namespace import (
3232
NS_RDF,
3333
NS_UCO_CORE,
@@ -49,6 +49,8 @@ class HashDict(typing.NamedTuple):
4949
sha1: str
5050
sha256: str
5151
sha512: str
52+
sha3_256: str
53+
sha3_512: str
5254

5355

5456
def create_file_node(
@@ -58,6 +60,9 @@ def create_file_node(
5860
node_prefix: str = DEFAULT_PREFIX,
5961
disable_hashes: bool = False,
6062
disable_mtime: bool = False,
63+
*args: typing.Any,
64+
use_deterministic_uuids: bool = False,
65+
**kwargs: typing.Any,
6166
) -> rdflib.URIRef:
6267
r"""
6368
This function characterizes the file at filepath.
@@ -68,7 +73,7 @@ def create_file_node(
6873
:param filepath: The path to the file to characterize. Can be relative or absolute.
6974
:type filepath: str
7075
71-
:param node_iri: The desired full IRI for the node. If absent, will make an IRI of the pattern ``ns_base + 'file-' + uuid4``
76+
:param node_iri: The desired full IRI for the node. If absent, will make an IRI of the pattern ``ns_base + 'File-' + uuid``
7277
:type node_iri: str
7378
7479
:param node_prefix: The base prefix to use if node_iri is not supplied.
@@ -86,7 +91,7 @@ def create_file_node(
8691
node_namespace = rdflib.Namespace(node_prefix)
8792

8893
if node_iri is None:
89-
node_slug = "file-" + case_utils.local_uuid.local_uuid()
94+
node_slug = "File-" + case_utils.local_uuid.local_uuid()
9095
node_iri = node_namespace[node_slug]
9196
n_file = rdflib.URIRef(node_iri)
9297
graph.add((n_file, NS_RDF.type, NS_UCO_OBSERVABLE.File))
@@ -95,7 +100,15 @@ def create_file_node(
95100
literal_basename = rdflib.Literal(basename)
96101

97102
file_stat = os.stat(filepath)
98-
n_file_facet = node_namespace["file-facet-" + case_utils.local_uuid.local_uuid()]
103+
104+
n_file_facet: rdflib.URIRef
105+
if use_deterministic_uuids:
106+
n_file_facet = case_utils.inherent_uuid.get_facet_uriref(
107+
n_file, NS_UCO_OBSERVABLE.FileFacet, namespace=node_namespace
108+
)
109+
else:
110+
n_file_facet = node_namespace["FileFacet-" + case_utils.local_uuid.local_uuid()]
111+
99112
graph.add(
100113
(
101114
n_file_facet,
@@ -122,9 +135,16 @@ def create_file_node(
122135
graph.add((n_file_facet, NS_UCO_OBSERVABLE.modifiedTime, literal_mtime))
123136

124137
if not disable_hashes:
125-
n_contentdata_facet = node_namespace[
126-
"content-data-facet-" + case_utils.local_uuid.local_uuid()
127-
]
138+
n_contentdata_facet: rdflib.URIRef
139+
if use_deterministic_uuids:
140+
n_contentdata_facet = case_utils.inherent_uuid.get_facet_uriref(
141+
n_file, NS_UCO_OBSERVABLE.ContentDataFacet, namespace=node_namespace
142+
)
143+
else:
144+
n_contentdata_facet = node_namespace[
145+
"ContentDataFacet-" + case_utils.local_uuid.local_uuid()
146+
]
147+
128148
graph.add((n_file, NS_UCO_CORE.hasFacet, n_contentdata_facet))
129149
graph.add(
130150
(n_contentdata_facet, NS_RDF.type, NS_UCO_OBSERVABLE.ContentDataFacet)
@@ -141,6 +161,8 @@ def create_file_node(
141161
sha1obj = hashlib.sha1()
142162
sha256obj = hashlib.sha256()
143163
sha512obj = hashlib.sha512()
164+
sha3_256obj = hashlib.sha3_256()
165+
sha3_512obj = hashlib.sha3_512()
144166
stashed_error = None
145167
byte_tally = 0
146168
with open(filepath, "rb") as in_fh:
@@ -159,6 +181,8 @@ def create_file_node(
159181
sha1obj.update(buf)
160182
sha256obj.update(buf)
161183
sha512obj.update(buf)
184+
sha3_256obj.update(buf)
185+
sha3_512obj.update(buf)
162186
if stashed_error is not None:
163187
raise stashed_error
164188
current_hashdict = HashDict(
@@ -167,6 +191,8 @@ def create_file_node(
167191
sha1obj.hexdigest(),
168192
sha256obj.hexdigest(),
169193
sha512obj.hexdigest(),
194+
sha3_256obj.hexdigest(),
195+
sha3_512obj.hexdigest(),
170196
)
171197
if last_hashdict == current_hashdict:
172198
successful_hashdict = current_hashdict
@@ -194,26 +220,48 @@ def create_file_node(
194220

195221
# Add confirmed hashes into graph.
196222
for key in successful_hashdict._fields:
197-
if key not in ("md5", "sha1", "sha256", "sha512"):
223+
if key not in ("md5", "sha1", "sha256", "sha512", "sha3_256", "sha3_512"):
198224
continue
199-
n_hash = node_namespace["hash-" + case_utils.local_uuid.local_uuid()]
225+
226+
l_hash_method: rdflib.Literal
227+
if key in ("sha3_256", "sha3_512"):
228+
l_hash_method = rdflib.Literal(
229+
key.replace("_", "-").upper(),
230+
datatype=NS_UCO_VOCABULARY.HashNameVocab,
231+
)
232+
else:
233+
l_hash_method = rdflib.Literal(
234+
key.upper(), datatype=NS_UCO_VOCABULARY.HashNameVocab
235+
)
236+
237+
hash_value: str = getattr(successful_hashdict, key)
238+
l_hash_value = rdflib.Literal(hash_value.upper(), datatype=NS_XSD.hexBinary)
239+
240+
hash_uuid: str
241+
if use_deterministic_uuids:
242+
hash_uuid = str(
243+
case_utils.inherent_uuid.hash_method_value_uuid(
244+
l_hash_method, l_hash_value
245+
)
246+
)
247+
else:
248+
hash_uuid = case_utils.local_uuid.local_uuid()
249+
n_hash = node_namespace["Hash-" + hash_uuid]
250+
200251
graph.add((n_contentdata_facet, NS_UCO_OBSERVABLE.hash, n_hash))
201252
graph.add((n_hash, NS_RDF.type, NS_UCO_TYPES.Hash))
202253
graph.add(
203254
(
204255
n_hash,
205256
NS_UCO_TYPES.hashMethod,
206-
rdflib.Literal(
207-
key.upper(), datatype=NS_UCO_VOCABULARY.HashNameVocab
208-
),
257+
l_hash_method,
209258
)
210259
)
211-
hash_value = getattr(successful_hashdict, key)
212260
graph.add(
213261
(
214262
n_hash,
215263
NS_UCO_TYPES.hashValue,
216-
rdflib.Literal(hash_value.upper(), datatype=NS_XSD.hexBinary),
264+
l_hash_value,
217265
)
218266
)
219267

@@ -226,6 +274,11 @@ def main() -> None:
226274
parser.add_argument("--debug", action="store_true")
227275
parser.add_argument("--disable-hashes", action="store_true")
228276
parser.add_argument("--disable-mtime", action="store_true")
277+
parser.add_argument(
278+
"--use-deterministic-uuids",
279+
action="store_true",
280+
help="Use UUIDs computed using the case_utils.inherent_uuid module.",
281+
)
229282
parser.add_argument(
230283
"--output-format", help="Override extension-based format guesser."
231284
)
@@ -258,14 +311,15 @@ def main() -> None:
258311
context_dictionary = {k: v for (k, v) in graph.namespace_manager.namespaces()}
259312
serialize_kwargs["context"] = context_dictionary
260313

261-
node_iri = NS_BASE["file-" + case_utils.local_uuid.local_uuid()]
314+
node_iri = NS_BASE["File-" + case_utils.local_uuid.local_uuid()]
262315
create_file_node(
263316
graph,
264317
args.in_file,
265318
node_iri=node_iri,
266319
node_prefix=args.base_prefix,
267320
disable_hashes=args.disable_hashes,
268321
disable_mtime=args.disable_mtime,
322+
use_deterministic_uuids=args.use_deterministic_uuids,
269323
)
270324

271325
graph.serialize(args.out_graph, **serialize_kwargs)

0 commit comments

Comments
 (0)