Skip to content

Commit 4362261

Browse files
committed
SQUASH COMMITS from other PR
Prune CSL Items to validate JSON schema Refs manubot#47 CSL: replace arxiv_id with archive_location Travis: install package using pip Attempt to fix python-jsonschema/jsonschema#449 --prune-csl option for manubot cite Only remove a single additional property sub_error Workaround the effect of citation-style-language/schema#154 Switch to dhimmel/schema CSL JSON Move validation to remove_jsonschema_errors Test CSL pruning Improve CSL pruning documentation Default to pruning unless --bad-csl flag supplied DOI CSL retriever: use shortDOI for URL Switch CSL pruning logging to DEBUG Update manubot cite help in README arxiv citeproc: use int for date-parts
1 parent 4c40387 commit 4362261

File tree

21 files changed

+4801
-84
lines changed

21 files changed

+4801
-84
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ addons:
99
apt_packages:
1010
- pandoc
1111
install:
12-
- python setup.py install
12+
- pip install .
1313
- python setup.py sdist bdist_wheel
1414
script: py.test
1515
deploy:

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ manubot cite doi:10.1098/rsif.2017.0387 pmid:29424689 pmcid:PMC5640425 arxiv:180
9898
Additional usage information is available from `manubot cite --help`:
9999

100100
```
101-
usage: manubot cite [-h] [--file FILE] citations [citations ...]
101+
usage: manubot cite [-h] [--file FILE] [--bad-csl] citations [citations ...]
102102
103103
Retrieve bibliographic metadata for one or more citation identifiers.
104104
@@ -108,6 +108,8 @@ positional arguments:
108108
optional arguments:
109109
-h, --help show this help message and exit
110110
--file FILE specify a file to write CSL output, otherwise default to stdout
111+
--bad-csl allow CSL Items that do not conform to the JSON Schema. Skips
112+
CSL pruning.
111113
```
112114

113115
## Installation

manubot/cite/__init__.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def get_citation_id(standard_citation):
9999
return citation_id
100100

101101

102-
def citation_to_citeproc(citation):
102+
def citation_to_citeproc(citation, prune=True):
103103
"""
104104
Return a dictionary with citation metadata
105105
"""
@@ -113,7 +113,7 @@ def citation_to_citeproc(citation):
113113
raise ValueError(msg)
114114

115115
citation_id = get_citation_id(citation)
116-
citeproc = citeproc_passthrough(citeproc, set_id=citation_id)
116+
citeproc = citeproc_passthrough(citeproc, set_id=citation_id, prune=prune)
117117

118118
return citeproc
119119

@@ -130,6 +130,11 @@ def add_subparser_cite(subparsers):
130130
default=sys.stdout,
131131
help='specify a file to write CSL output, otherwise default to stdout',
132132
)
133+
parser.add_argument(
134+
'--bad-csl',
135+
action='store_true',
136+
help='allow CSL Items that do not conform to the JSON Schema. Skips CSL pruning.',
137+
)
133138
parser.add_argument(
134139
'citations',
135140
nargs='+',
@@ -143,7 +148,7 @@ def cli_cite(args):
143148
csl_list = list()
144149
for citation in args.citations:
145150
citation = standardize_citation(citation)
146-
csl_list.append(citation_to_citeproc(citation))
151+
csl_list.append(citation_to_citeproc(citation, prune=not args.bad_csl))
147152
with args.file as write_file:
148153
json.dump(csl_list, write_file, ensure_ascii=False, indent=2)
149154
write_file.write('\n')

manubot/cite/arxiv.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def get_arxiv_citeproc(arxiv_id):
4646
pattern = re.compile(r'arxiv.org/abs/(.+)')
4747
match = pattern.search(url)
4848
versioned_id = match.group(1)
49-
csl_item['arxiv_id'] = versioned_id
49+
csl_item['archive_location'] = versioned_id
5050
_, csl_item['version'] = versioned_id.rsplit('v', 1)
5151
csl_item['URL'] = 'https://arxiv.org/abs/' + versioned_id
5252

@@ -56,7 +56,7 @@ def get_arxiv_citeproc(arxiv_id):
5656
# Extract CSL date field
5757
published = entry.findtext(prefix + 'published')
5858
published, _ = published.split('T', 1)
59-
csl_item['issued'] = {'date-parts': [published.split('-')]}
59+
csl_item['issued'] = {'date-parts': [[int(x) for x in published.split('-')]]}
6060

6161
# Extract authors
6262
authors = list()

manubot/cite/citeproc.py

Lines changed: 123 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,9 @@
1-
# Valid CSL (citeproc JSON) types as per
2-
# https://github.com/citation-style-language/schema/blob/4846e02f0a775a8272819204379a4f8d7f45c16c/csl-types.rnc#L5-L39
3-
citeproc_types = {
4-
"article",
5-
"article-journal",
6-
"article-magazine",
7-
"article-newspaper",
8-
"bill",
9-
"book",
10-
"broadcast",
11-
"chapter",
12-
"dataset",
13-
"entry",
14-
"entry-dictionary",
15-
"entry-encyclopedia",
16-
"figure",
17-
"graphic",
18-
"interview",
19-
"legal_case",
20-
"legislation",
21-
"manuscript",
22-
"map",
23-
"motion_picture",
24-
"musical_score",
25-
"pamphlet",
26-
"paper-conference",
27-
"patent",
28-
"personal_communication",
29-
"post",
30-
"post-weblog",
31-
"report",
32-
"review",
33-
"review-book",
34-
"song",
35-
"speech",
36-
"thesis",
37-
"treaty",
38-
"webpage",
39-
}
1+
import copy
2+
import functools
3+
import logging
4+
5+
import jsonref
6+
import jsonschema
407

418
citeproc_type_fixer = {
429
'journal-article': 'article-journal',
@@ -47,51 +14,133 @@
4714
'reference-entry': 'entry',
4815
}
4916

50-
# Remove citeproc keys to fix pandoc-citeproc errors
51-
citeproc_remove_keys = [
52-
# Error in $[0].ISSN[0]: failed to parse field ISSN: mempty
53-
'ISSN',
54-
# Error in $[2].ISBN[0]: failed to parse field ISBN: mempty
55-
'ISBN',
56-
# pandoc-citeproc expected Object not array for archive
57-
'archive',
58-
# failed to parse field event: Could not read as string
59-
'event',
60-
# remove the references of cited papers. Not neccessary and unwieldy.
61-
'reference',
62-
# Error in $[26].categories[0][0]: failed to parse field categories: mempty
63-
'categories',
64-
]
65-
66-
67-
def citeproc_passthrough(csl_item, set_id=None):
17+
18+
def citeproc_passthrough(csl_item, set_id=None, prune=True):
6819
"""
69-
Fix errors in a CSL item and optionally change its id.
20+
Fix errors in a CSL item, according to the CSL JSON schema, and optionally
21+
change its id.
22+
23+
http://docs.citationstyles.org/en/1.0.1/specification.html
7024
http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
7125
https://github.com/citation-style-language/schema/blob/master/csl-data.json
7226
"""
7327
if set_id is not None:
7428
csl_item['id'] = set_id
29+
logging.debug(f"Starting citeproc_passthrough with{'' if prune else 'out'} CSL pruning for id: {csl_item.get('id', 'id not specified')}")
7530

7631
# Correct invalid CSL item types
7732
# See https://github.com/CrossRef/rest-api-doc/issues/187
78-
old_type = csl_item['type']
79-
csl_type = citeproc_type_fixer.get(old_type, old_type)
80-
if csl_type not in citeproc_types:
81-
csl_type = 'entry'
82-
csl_item['type'] = csl_type
83-
84-
# Remove problematic objects
85-
for key in citeproc_remove_keys:
86-
csl_item.pop(key, None)
87-
88-
# pandoc-citeproc error
89-
# failed to parse field issued: Could not read as string: Null
33+
csl_item['type'] = citeproc_type_fixer.get(csl_item['type'], csl_item['type'])
34+
35+
if prune:
36+
# Remove fields that violate the CSL Item JSON Schema
37+
csl_item, = remove_jsonschema_errors([csl_item])
38+
39+
# Default CSL type to entry
40+
csl_item['type'] = csl_item.get('type', 'entry')
41+
42+
if prune:
43+
# Confirm that corrected CSL validates
44+
validator = get_jsonschema_csl_validator()
45+
validator.validate([csl_item])
46+
return csl_item
47+
48+
49+
@functools.lru_cache()
50+
def get_jsonschema_csl_validator():
51+
"""
52+
Return a jsonschema validator for the CSL Item JSON Schema
53+
"""
54+
url = 'https://github.com/dhimmel/schema/raw/manubot/csl-data.json'
55+
# Use jsonref to work around https://github.com/Julian/jsonschema/issues/447
56+
schema = jsonref.load_uri(url, jsonschema=True)
57+
Validator = jsonschema.validators.validator_for(schema)
58+
Validator.check_schema(schema)
59+
return Validator(schema)
60+
61+
62+
def remove_jsonschema_errors(instance):
63+
"""
64+
Remove fields in CSL Items that produce JSON Schema errors. Note that this
65+
method may not work for all types of JSON Schema errors and users
66+
looking to adapt it for other applications should write task-specific tests
67+
to provide empirical evidence that it works as intended.
68+
69+
See also:
70+
https://github.com/Julian/jsonschema/issues/448
71+
https://stackoverflow.com/questions/44694835
72+
"""
73+
validator = get_jsonschema_csl_validator()
74+
errors = list(validator.iter_errors(instance))
75+
instance = copy.deepcopy(instance)
76+
errors = sorted(errors, key=lambda e: e.path, reverse=True)
77+
for error in errors:
78+
_remove_error(instance, error)
79+
return instance
80+
81+
82+
def _delete_elem(instance, path, absolute_path=None, message=''):
83+
"""
84+
Helper function for remove_jsonschema_errors that deletes an element in the
85+
JSON-like input instance at the specified path. absolute_path is relative
86+
to the original validated instance for logging purposes. Defaults to path,
87+
if not specified. message is an optional string with additional error
88+
information to log.
89+
"""
90+
if absolute_path is None:
91+
absolute_path = path
92+
logging.debug(
93+
(f'{message}\n' if message else message) +
94+
'_delete_elem deleting CSL element at: ' +
95+
'/'.join(map(str, absolute_path))
96+
)
97+
*head, tail = path
9098
try:
91-
value = csl_item['issued']['date-parts'][0][0]
92-
if value is None:
93-
del csl_item['issued']
99+
del _deep_get(instance, head)[tail]
94100
except KeyError:
95101
pass
96102

97-
return csl_item
103+
104+
def _deep_get(instance, path):
105+
"""
106+
Descend path to return a deep element in the JSON object instance.
107+
"""
108+
for key in path:
109+
instance = instance[key]
110+
return instance
111+
112+
113+
def _remove_error(instance, error):
114+
"""
115+
Remove a jsonschema ValidationError from the JSON-like instance.
116+
117+
See ValidationError documentation at
118+
http://python-jsonschema.readthedocs.io/en/latest/errors/#jsonschema.exceptions.ValidationError
119+
"""
120+
sub_errors = error.context
121+
if sub_errors:
122+
# already_removed_additional was necessary to work around https://github.com/citation-style-language/schema/issues/154
123+
already_removed_additional = False
124+
for sub_error in sub_errors:
125+
if sub_error.validator == 'additionalProperties':
126+
if already_removed_additional:
127+
continue
128+
already_removed_additional = True
129+
sub_instance = _deep_get(instance, error.path)
130+
_remove_error(sub_instance, sub_error)
131+
elif error.validator == 'additionalProperties':
132+
extras = set(error.instance) - set(error.schema['properties'])
133+
logging.debug(
134+
error.message +
135+
f'\nWill now remove these {len(extras)} additional properties.'
136+
)
137+
for key in extras:
138+
_delete_elem(
139+
instance=instance,
140+
path=list(error.path) + [key],
141+
absolute_path=list(error.absolute_path) + [key]
142+
)
143+
elif error.validator in {'enum', 'type'}:
144+
_delete_elem(instance, error.path, error.absolute_path, error.message)
145+
else:
146+
raise NotImplementedError(f'{error.validator} is not yet supported')

manubot/cite/doi.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def get_short_doi_url(doi):
1919
short_url = 'https://doi.org/' + short_doi
2020
return short_url
2121
except Exception:
22-
logging.exception(f'shortDOI lookup failed for {doi}')
22+
logging.warning(f'shortDOI lookup failed for {doi}', exc_info=True)
2323
return None
2424

2525

@@ -42,7 +42,7 @@ def get_doi_citeproc(doi):
4242
citeproc['URL'] = f'https://doi.org/{doi}'
4343
short_doi_url = get_short_doi_url(doi)
4444
if short_doi_url:
45-
citeproc['short_url'] = short_doi_url
45+
citeproc['URL'] = short_doi_url
4646
try:
4747
citeproc.update(get_pubmed_ids_for_doi(doi))
4848
except Exception:

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@
5050
install_requires=[
5151
'errorhandler',
5252
'jinja2',
53+
'jsonref',
54+
'jsonschema',
5355
'pandas',
5456
'pybase62',
5557
'pyyaml',

0 commit comments

Comments
 (0)