Skip to content
This repository was archived by the owner on Sep 16, 2022. It is now read-only.

Commit 6fceb94

Browse files
committed
CSCMETAX-280:[ADD] datacatalog harvesting
1 parent 619b05f commit 6fceb94

File tree

3 files changed

+135
-47
lines changed

3 files changed

+135
-47
lines changed

src/metax_api/api/oaipmh/base/metax_oai_server.py

+68 -37
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
from oaipmh.error import IdDoesNotExistError
88
from oaipmh.error import BadArgumentError
99

10-
from metax_api.models.catalog_record import CatalogRecord
10+
from metax_api.models.catalog_record import CatalogRecord, DataCatalog
1111
from metax_api.services import CatalogRecordService as CRS
1212

1313
syke_url_prefix_template = 'http://metatieto.ymparisto.fi:8080/geoportal/catalog/search/resource/details.page?uuid=%s'
@@ -16,7 +16,7 @@
1616
class MetaxOAIServer(ResumptionOAIPMH):
1717

1818
def _is_valid_set(self, set):
19-
if not set or set == 'urnresolver' or set in settings.OAI['SET_MAPPINGS']:
19+
if not set or set in ['urnresolver', 'datacatalogs'] or set in settings.OAI['SET_MAPPINGS']:
2020
return True
2121
return False
2222

@@ -32,16 +32,20 @@ def _get_filtered_records(self, set, cursor, batch_size, from_=None, until=None)
3232
if not self._is_valid_set(set):
3333
raise BadArgumentError('invalid set value')
3434

35-
query_set = CatalogRecord.objects.all()
35+
proxy = CatalogRecord
36+
if set == 'datacatalogs':
37+
proxy = DataCatalog
38+
39+
query_set = proxy.objects.all()
3640
if from_ and until:
37-
query_set = CatalogRecord.objects.filter(date_modified__gte=from_, date_modified__lte=until)
41+
query_set = proxy.objects.filter(date_modified__gte=from_, date_modified__lte=until)
3842
elif from_:
39-
query_set = CatalogRecord.objects.filter(date_modified__gte=from_)
43+
query_set = proxy.objects.filter(date_modified__gte=from_)
4044
elif until:
41-
query_set = CatalogRecord.objects.filter(date_modified__lte=until)
45+
query_set = proxy.objects.filter(date_modified__lte=until)
4246

4347
if set:
44-
if set == 'urnresolver':
48+
if set in ['urnresolver', 'datacatalogs']:
4549
pass
4650
else:
4751
query_set = query_set.filter(
@@ -98,72 +102,76 @@ def _get_oaic_dc_value(self, value, lang=None):
98102
valueDict['lang'] = lang
99103
return valueDict
100104

101-
def _get_oai_dc_metadata(self, record):
102-
identifier = self._get_oaic_dc_value(record.research_dataset.get('preferred_identifier'))
105+
def _get_oai_dc_metadata(self, record, json, type):
106+
identifier = []
107+
if 'preferred_identifier' in json:
108+
identifier.append(self._get_oaic_dc_value(json.get('preferred_identifier')))
109+
if 'identifier' in json:
110+
identifier.append(self._get_oaic_dc_value(json.get('identifier')))
103111

104112
title = []
105-
title_data = record.research_dataset.get('title', {})
113+
title_data = json.get('title', {})
106114
for key, value in title_data.items():
107115
title.append(self._get_oaic_dc_value(value, key))
108116

109117
creator = []
110-
creator_data = record.research_dataset.get('creator', [])
118+
creator_data = json.get('creator', [])
111119
for value in creator_data:
112120
if 'name' in value:
113121
creator.append(self._get_oaic_dc_value(value.get('name')))
114122

115123
subject = []
116-
subject_data = record.research_dataset.get('keyword', [])
124+
subject_data = json.get('keyword', [])
117125
for value in subject_data:
118126
subject.append(self._get_oaic_dc_value(value))
119-
subject_data = record.research_dataset.get('field_of_science', [])
127+
subject_data = json.get('field_of_science', [])
120128
for value in subject_data:
121129
for key, value2 in value.get('pref_label', {}).items():
122130
subject.append(self._get_oaic_dc_value(value2, key))
123-
subject_data = record.research_dataset.get('theme', [])
131+
subject_data = json.get('theme', [])
124132
for value in subject_data:
125133
for key, value2 in value.get('pref_label', {}).items():
126134
subject.append(self._get_oaic_dc_value(value2, key))
127135

128136
desc = []
129-
desc_data = record.research_dataset.get('description', [])
137+
desc_data = json.get('description', [])
130138
for value in desc_data:
131139
for key, value2 in value.items():
132140
desc.append(self._get_oaic_dc_value(value2, key))
133141

134142
publisher = []
135-
publisher_data = record.research_dataset.get('publisher', {})
143+
publisher_data = json.get('publisher', {})
136144
for key, value in publisher_data.get('name', {}).items():
137145
publisher.append(self._get_oaic_dc_value(value))
138146

139147
contributor = []
140-
contributor_data = record.research_dataset.get('contributor', [])
148+
contributor_data = json.get('contributor', [])
141149
for value in contributor_data:
142150
if 'name' in value:
143151
contributor.append(self._get_oaic_dc_value(value.get('name')))
144152

145153
date = self._get_oaic_dc_value(str(record.date_created))
146154

147155
language = []
148-
language_data = record.research_dataset.get('language', [])
156+
language_data = json.get('language', [])
149157
for value in language_data:
150158
for key, value2 in value.items():
151159
language.append(self._get_oaic_dc_value(value2))
152160

153161
relation = []
154-
relation_data = record.research_dataset.get('relation', [])
162+
relation_data = json.get('relation', [])
155163
for value in relation_data:
156164
if 'identifier'in value.get('entity', {}):
157165
relation.append(self._get_oaic_dc_value(value['entity']['identifier']))
158166

159167
coverage = []
160-
coverage_data = record.research_dataset.get('spatial', [])
168+
coverage_data = json.get('spatial', [])
161169
for value in coverage_data:
162170
if 'geographic_name' in value:
163171
coverage.append(self._get_oaic_dc_value(value['geographic_name']))
164172

165173
rights = []
166-
rights_data = record.research_dataset.get('access_rights', {})
174+
rights_data = json.get('access_rights', {})
167175
for value in rights_data.get('description', []):
168176
for key, value2 in value.items():
169177
rights.append(self._get_oaic_dc_value(value2, key))
@@ -172,10 +180,10 @@ def _get_oai_dc_metadata(self, record):
172180
rights.append(self._get_oaic_dc_value(value['identifier']))
173181

174182
types = []
175-
types.append(self._get_oaic_dc_value('Dataset'))
183+
types.append(self._get_oaic_dc_value(type))
176184

177185
meta = {
178-
'identifier': [identifier],
186+
'identifier': identifier,
179187
'title': title,
180188
'creator': creator,
181189
'subject': subject,
@@ -202,16 +210,16 @@ def _get_oai_datacite_metadata(self, record):
202210
}
203211
return meta
204212

205-
def _get_metadata_for_record(self, record, metadata_prefix):
213+
def _get_metadata_for_record(self, record, json, type, metadata_prefix):
206214
meta = {}
207215

208216
# strip sensitive fields from research_dataset. note: the modified research_dataset
209217
# is placed back into the record's research_dataset -field. meaning, an accidental call
210218
# of record.save() would overwrite the original data
211-
record.research_dataset = CRS.strip_catalog_record(record.research_dataset)
219+
record.research_dataset = CRS.strip_catalog_record(json)
212220

213221
if metadata_prefix == 'oai_dc':
214-
meta = self._get_oai_dc_metadata(record)
222+
meta = self._get_oai_dc_metadata(record, json, type)
215223
elif metadata_prefix == 'oai_datacite':
216224
meta = self._get_oai_datacite_metadata(record)
217225
elif metadata_prefix == 'oai_dc_urnresolver':
@@ -226,9 +234,14 @@ def _get_header_timestamp(self, record):
226234
timestamp = record.date_created
227235
return timezone.make_naive(timestamp)
228236

229-
def _get_oai_item(self, record, metadata_prefix):
230-
identifier = record.identifier
231-
metadata = self._get_metadata_for_record(record, metadata_prefix)
237+
def _get_oai_item(self, identifier, record, metadata_prefix):
238+
metadata = self._get_metadata_for_record(record, record.research_dataset, 'Dataset', metadata_prefix)
239+
item = (common.Header('', identifier, self._get_header_timestamp(record), ['metax'], False),
240+
common.Metadata('', metadata), None)
241+
return item
242+
243+
def _get_oai_catalog_item(self, identifier, record, metadata_prefix):
244+
metadata = self._get_metadata_for_record(record, record.catalog_json, 'Datacatalog', metadata_prefix)
232245
item = (common.Header('', identifier, self._get_header_timestamp(record), ['metax'], False),
233246
common.Metadata('', metadata), None)
234247
return item
@@ -281,18 +294,24 @@ def listMetadataFormats(self, identifier=None):
281294

282295
def listSets(self, cursor=None, batch_size=None):
283296
"""Implement OAI-PMH verb ListSets."""
284-
data = []
297+
data = [('datacatalogs', 'datacatalog', '')]
285298
for set_key in settings.OAI['SET_MAPPINGS'].keys():
286299
data.append((set_key, set_key, ''))
287300
return data
288301

302+
def _get_record_identifier(self, record, set):
303+
if set == 'datacatalogs':
304+
return record.catalog_json['identifier']
305+
else:
306+
return record.identifier
307+
289308
def listIdentifiers(self, metadataPrefix=None, set=None, cursor=None,
290309
from_=None, until=None, batch_size=None):
291310
"""Implement OAI-PMH verb listIdentifiers."""
292311
records = self._get_filtered_records(set, cursor, batch_size, from_, until)
293312
data = []
294313
for record in records:
295-
identifier = record.research_dataset.get('preferred_identifier')
314+
identifier = self._get_record_identifier(record, set)
296315
data.append(common.Header('', identifier, self._get_header_timestamp(record), ['metax'], False))
297316
return data
298317

@@ -302,18 +321,30 @@ def listRecords(self, metadataPrefix=None, set=None, cursor=None, from_=None,
302321
data = []
303322
records = self._get_filtered_records(set, cursor, batch_size, from_, until)
304323
for record in records:
305-
data.append(self._get_oai_item(record, metadataPrefix))
324+
identifier = self._get_record_identifier(record, set)
325+
if set == 'datacatalogs':
326+
data.append(self._get_oai_catalog_item(identifier, record, metadataPrefix))
327+
else:
328+
data.append(self._get_oai_item(identifier, record, metadataPrefix))
306329
return data
307330

308331
def getRecord(self, metadataPrefix, identifier):
309332
"""Implement OAI-PMH verb GetRecord."""
310333
try:
311-
record = CatalogRecord.objects.get(
312-
# data_catalog__catalog_json__identifier__in=self._get_default_set_filter(),
313-
identifier__exact=identifier
314-
)
334+
if CatalogRecord.objects.filter(identifier__exact=identifier).exists():
335+
record = CatalogRecord.objects.get(identifier__exact=identifier)
336+
json = record.research_dataset
337+
type = 'Dataset'
338+
else:
339+
record = DataCatalog.objects.get(catalog_json__identifier__exact=identifier)
340+
json = record.catalog_json
341+
type = 'Datacatalog'
342+
315343
except CatalogRecord.DoesNotExist:
316344
raise IdDoesNotExistError("No dataset with id %s available through the OAI-PMH interface." % identifier)
317-
metadata = self._get_metadata_for_record(record, metadataPrefix)
345+
except DataCatalog.DoesNotExist:
346+
raise IdDoesNotExistError("No datacatalog with id %s available through the OAI-PMH interface." % identifier)
347+
348+
metadata = self._get_metadata_for_record(record, json, type, metadataPrefix)
318349
return (common.Header('', identifier, self._get_header_timestamp(record), ['metax'], False),
319350
common.Metadata('', metadata), None)
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
from .minimal_api import *
2+
from .syke import *

0 commit comments

Comments
 (0)