|
1 |
| -# Valid CSL (citeproc JSON) types as per |
2 |
| -# https://github.com/citation-style-language/schema/blob/4846e02f0a775a8272819204379a4f8d7f45c16c/csl-types.rnc#L5-L39 |
3 |
| -citeproc_types = { |
4 |
| - "article", |
5 |
| - "article-journal", |
6 |
| - "article-magazine", |
7 |
| - "article-newspaper", |
8 |
| - "bill", |
9 |
| - "book", |
10 |
| - "broadcast", |
11 |
| - "chapter", |
12 |
| - "dataset", |
13 |
| - "entry", |
14 |
| - "entry-dictionary", |
15 |
| - "entry-encyclopedia", |
16 |
| - "figure", |
17 |
| - "graphic", |
18 |
| - "interview", |
19 |
| - "legal_case", |
20 |
| - "legislation", |
21 |
| - "manuscript", |
22 |
| - "map", |
23 |
| - "motion_picture", |
24 |
| - "musical_score", |
25 |
| - "pamphlet", |
26 |
| - "paper-conference", |
27 |
| - "patent", |
28 |
| - "personal_communication", |
29 |
| - "post", |
30 |
| - "post-weblog", |
31 |
| - "report", |
32 |
| - "review", |
33 |
| - "review-book", |
34 |
| - "song", |
35 |
| - "speech", |
36 |
| - "thesis", |
37 |
| - "treaty", |
38 |
| - "webpage", |
39 |
| -} |
| 1 | +import copy |
| 2 | +import functools |
| 3 | +import logging |
| 4 | + |
| 5 | +import jsonref |
| 6 | +import jsonschema |
40 | 7 |
|
41 | 8 | citeproc_type_fixer = {
|
42 | 9 | 'journal-article': 'article-journal',
|
|
47 | 14 | 'reference-entry': 'entry',
|
48 | 15 | }
|
49 | 16 |
|
50 |
| -# Remove citeproc keys to fix pandoc-citeproc errors |
51 |
| -citeproc_remove_keys = [ |
52 |
| - # Error in $[0].ISSN[0]: failed to parse field ISSN: mempty |
53 |
| - 'ISSN', |
54 |
| - # Error in $[2].ISBN[0]: failed to parse field ISBN: mempty |
55 |
| - 'ISBN', |
56 |
| - # pandoc-citeproc expected Object not array for archive |
57 |
| - 'archive', |
58 |
| - # failed to parse field event: Could not read as string |
59 |
| - 'event', |
60 |
| - # remove the references of cited papers. Not neccessary and unwieldy. |
61 |
| - 'reference', |
62 |
| - # Error in $[26].categories[0][0]: failed to parse field categories: mempty |
63 |
| - 'categories', |
64 |
| -] |
65 |
| - |
66 |
| - |
67 |
| -def citeproc_passthrough(csl_item, set_id=None): |
| 17 | + |
| 18 | +def citeproc_passthrough(csl_item, set_id=None, prune=True): |
68 | 19 | """
|
69 |
| - Fix errors in a CSL item and optionally change its id. |
| 20 | + Fix errors in a CSL item, according to the CSL JSON schema, and optionally |
| 21 | + change its id. |
| 22 | +
|
| 23 | + http://docs.citationstyles.org/en/1.0.1/specification.html |
70 | 24 | http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
|
71 | 25 | https://github.com/citation-style-language/schema/blob/master/csl-data.json
|
72 | 26 | """
|
73 | 27 | if set_id is not None:
|
74 | 28 | csl_item['id'] = set_id
|
| 29 | + logging.debug(f"Starting citeproc_passthrough with{'' if prune else 'out'} CSL pruning for id: {csl_item.get('id', 'id not specified')}") |
75 | 30 |
|
76 | 31 | # Correct invalid CSL item types
|
77 | 32 | # See https://github.com/CrossRef/rest-api-doc/issues/187
|
78 |
| - old_type = csl_item['type'] |
79 |
| - csl_type = citeproc_type_fixer.get(old_type, old_type) |
80 |
| - if csl_type not in citeproc_types: |
81 |
| - csl_type = 'entry' |
82 |
| - csl_item['type'] = csl_type |
83 |
| - |
84 |
| - # Remove problematic objects |
85 |
| - for key in citeproc_remove_keys: |
86 |
| - csl_item.pop(key, None) |
87 |
| - |
88 |
| - # pandoc-citeproc error |
89 |
| - # failed to parse field issued: Could not read as string: Null |
| 33 | + csl_item['type'] = citeproc_type_fixer.get(csl_item['type'], csl_item['type']) |
| 34 | + |
| 35 | + if prune: |
| 36 | + # Remove fields that violate the CSL Item JSON Schema |
| 37 | + csl_item, = remove_jsonschema_errors([csl_item]) |
| 38 | + |
| 39 | + # Default CSL type to entry |
| 40 | + csl_item['type'] = csl_item.get('type', 'entry') |
| 41 | + |
| 42 | + if prune: |
| 43 | + # Confirm that corrected CSL validates |
| 44 | + validator = get_jsonschema_csl_validator() |
| 45 | + validator.validate([csl_item]) |
| 46 | + return csl_item |
| 47 | + |
| 48 | + |
| 49 | +@functools.lru_cache() |
| 50 | +def get_jsonschema_csl_validator(): |
| 51 | + """ |
| 52 | + Return a jsonschema validator for the CSL Item JSON Schema |
| 53 | + """ |
| 54 | + url = 'https://github.com/dhimmel/schema/raw/manubot/csl-data.json' |
| 55 | + # Use jsonref to work around https://github.com/Julian/jsonschema/issues/447 |
| 56 | + schema = jsonref.load_uri(url, jsonschema=True) |
| 57 | + Validator = jsonschema.validators.validator_for(schema) |
| 58 | + Validator.check_schema(schema) |
| 59 | + return Validator(schema) |
| 60 | + |
| 61 | + |
| 62 | +def remove_jsonschema_errors(instance): |
| 63 | + """ |
| 64 | + Remove fields in CSL Items that produce JSON Schema errors. Note that this |
| 65 | + method may not work for all types of JSON Schema errors, and users |
| 66 | + looking to adapt it for other applications should write task-specific tests |
| 67 | + to provide empirical evidence that it works as intended. |
| 68 | +
|
| 69 | + See also: |
| 70 | + https://github.com/Julian/jsonschema/issues/448 |
| 71 | + https://stackoverflow.com/questions/44694835 |
| 72 | + """ |
| 73 | + validator = get_jsonschema_csl_validator() |
| 74 | + errors = list(validator.iter_errors(instance)) |
| 75 | + instance = copy.deepcopy(instance) |
| 76 | + errors = sorted(errors, key=lambda e: e.path, reverse=True) |
| 77 | + for error in errors: |
| 78 | + _remove_error(instance, error) |
| 79 | + return instance |
| 80 | + |
| 81 | + |
| 82 | +def _delete_elem(instance, path, absolute_path=None, message=''): |
| 83 | + """ |
| 84 | + Helper function for remove_jsonschema_errors that deletes an element in the |
| 85 | + JSON-like input instance at the specified path. absolute_path is relative |
| 86 | + to the original validated instance for logging purposes. Defaults to path, |
| 87 | + if not specified. message is an optional string with additional error |
| 88 | + information to log. |
| 89 | + """ |
| 90 | + if absolute_path is None: |
| 91 | + absolute_path = path |
| 92 | + logging.debug( |
| 93 | + (f'{message}\n' if message else message) + |
| 94 | + '_delete_elem deleting CSL element at: ' + |
| 95 | + '/'.join(map(str, absolute_path)) |
| 96 | + ) |
| 97 | + *head, tail = path |
90 | 98 | try:
|
91 |
| - value = csl_item['issued']['date-parts'][0][0] |
92 |
| - if value is None: |
93 |
| - del csl_item['issued'] |
| 99 | + del _deep_get(instance, head)[tail] |
94 | 100 | except KeyError:
|
95 | 101 | pass
|
96 | 102 |
|
97 |
| - return csl_item |
| 103 | + |
| 104 | +def _deep_get(instance, path): |
| 105 | + """ |
| 106 | + Descend path to return a deep element in the JSON object instance. |
| 107 | + """ |
| 108 | + for key in path: |
| 109 | + instance = instance[key] |
| 110 | + return instance |
| 111 | + |
| 112 | + |
| 113 | +def _remove_error(instance, error): |
| 114 | + """ |
| 115 | + Remove a jsonschema ValidationError from the JSON-like instance. |
| 116 | +
|
| 117 | + See ValidationError documentation at |
| 118 | + http://python-jsonschema.readthedocs.io/en/latest/errors/#jsonschema.exceptions.ValidationError |
| 119 | + """ |
| 120 | + sub_errors = error.context |
| 121 | + if sub_errors: |
| 122 | + # already_removed_additional was necessary to work around https://github.com/citation-style-language/schema/issues/154 |
| 123 | + already_removed_additional = False |
| 124 | + for sub_error in sub_errors: |
| 125 | + if sub_error.validator == 'additionalProperties': |
| 126 | + if already_removed_additional: |
| 127 | + continue |
| 128 | + already_removed_additional = True |
| 129 | + sub_instance = _deep_get(instance, error.path) |
| 130 | + _remove_error(sub_instance, sub_error) |
| 131 | + elif error.validator == 'additionalProperties': |
| 132 | + extras = set(error.instance) - set(error.schema['properties']) |
| 133 | + logging.debug( |
| 134 | + error.message + |
| 135 | + f'\nWill now remove these {len(extras)} additional properties.' |
| 136 | + ) |
| 137 | + for key in extras: |
| 138 | + _delete_elem( |
| 139 | + instance=instance, |
| 140 | + path=list(error.path) + [key], |
| 141 | + absolute_path=list(error.absolute_path) + [key] |
| 142 | + ) |
| 143 | + elif error.validator in {'enum', 'type'}: |
| 144 | + _delete_elem(instance, error.path, error.absolute_path, error.message) |
| 145 | + else: |
| 146 | + raise NotImplementedError(f'{error.validator} is not yet supported') |
0 commit comments