Skip to content

[SHARE-739][Improvement] Check quality of OAI sources #660

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion project/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,12 @@
'RawData Janitor': {
'task': 'share.janitor.tasks.rawdata_janitor',
'schedule': crontab(minute=0) # hourly
}
},
'Source Stats': {
'task': 'share.tasks.source_stats',
'schedule': crontab(minute=0, hour='3,9,15,21'), # every 6 hours
'args': (),
},
}

CELERY_RESULT_EXPIRES = 60 * 60 * 24 * 3 # 3 days
Expand Down
32 changes: 32 additions & 0 deletions share/admin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from share.models.logs import HarvestLog
from share.models.meta import Subject, SubjectTaxonomy
from share.models.registration import ProviderRegistration
from share.models.sources import SourceStat


admin.site.register(AbstractCreativeWork, CreativeWorkAdmin)
Expand Down Expand Up @@ -347,6 +348,36 @@ def recursive_link_list(subjects):
subject_links.short_description = 'Subjects'


class SourceStatAdmin(admin.ModelAdmin):
    """Admin changelist configuration for ``SourceStat`` rows.

    Lists each stat with its config label, creation date, the two
    ``*_match`` flags, the response timing/status and a colour-coded
    letter grade. Rows can be searched by config label or source title
    and filtered by grade, response status code or config label.
    """

    search_fields = ('config__label', 'config__source__long_title')
    list_display = ('label', 'date_created', 'base_urls_match', 'earliest_datestamps_match', 'response_elapsed_time', 'response_status_code', 'grade_')
    list_filter = ('grade', 'response_status_code', 'config__label')

    # Known grade values mapped to the colour / letter shown in the list.
    GRADE_COLORS = {
        0: 'red',
        5: 'orange',
        10: 'green',
    }
    GRADE_LETTERS = {
        0: 'F',
        5: 'C',
        10: 'A',
    }
    # Colour used when a row's grade has no entry in the tables above.
    UNKNOWN_GRADE_COLOR = 'gray'

    def source(self, obj):
        """Return the long title of the source that owns this stat's config."""
        return obj.config.source.long_title

    def label(self, obj):
        """Return the label of the source config this stat belongs to."""
        return obj.config.label

    def grade_(self, obj):
        """Render the grade as a bold, colour-coded letter.

        ``grade`` is a free-form float column, so a value outside the
        known table must not raise ``KeyError`` (that would take down
        the whole changelist page). Unknown grades fall back to a
        neutral colour and display the raw numeric value instead.
        """
        return format_html(
            '<span style="font-weight: bold; color: {}">{}</span>',
            self.GRADE_COLORS.get(obj.grade, self.UNKNOWN_GRADE_COLOR),
            self.GRADE_LETTERS.get(obj.grade, obj.grade),
        )
    grade_.short_description = 'Grade'
    # Let the rendered column be sorted by the underlying model field.
    grade_.admin_order_field = 'grade'


admin.site.unregister(AccessToken)
admin.site.register(AccessToken, AccessTokenAdmin)

Expand All @@ -362,4 +393,5 @@ def recursive_link_list(subjects):
admin.site.register(Source, SourceAdmin)
admin.site.register(SourceConfig, SourceConfigAdmin)
admin.site.register(SubjectTaxonomy, SubjectTaxonomyAdmin)
admin.site.register(SourceStat, SourceStatAdmin)
admin.site.register(Transformer)
36 changes: 36 additions & 0 deletions share/migrations/0039_sourcestat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.1 on 2017-06-13 17:46
from __future__ import unicode_literals

from django.db import migrations, models
import django.db.models.deletion


# Schema migration that creates the SourceStat model.
# Auto-generated by Django; edit with care, since applied migrations are
# recorded by name in each database.
class Migration(migrations.Migration):

# Must be applied after migration 0038 of the 'share' app.
dependencies = [
('share', '0038_auto_20170606_1857'),
]

# Single operation: create the SourceStat model with the fields listed below.
operations = [
migrations.CreateModel(
name='SourceStat',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('is_deleted', models.BooleanField(default=False)),
('date_created', models.DateTimeField(auto_now_add=True)),
('response_status_code', models.SmallIntegerField(blank=True, null=True)),
('response_elapsed_time', models.FloatField(blank=True, null=True)),
('response_exception', models.TextField(blank=True, null=True)),
('earliest_datestamp_config', models.DateField(blank=True, null=True)),
('base_url_config', models.TextField()),
('admin_note', models.TextField(blank=True)),
('grade', models.FloatField()),
('earliest_datestamp_source', models.DateField(blank=True, null=True)),
('earliest_datestamps_match', models.BooleanField(default=False)),
('base_url_source', models.TextField(blank=True, null=True)),
('base_urls_match', models.BooleanField(default=False)),
# Deleting a SourceConfig cascades to its SourceStat rows.
('config', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='share.SourceConfig')),
],
),
]
16 changes: 16 additions & 0 deletions share/migrations/0043_merge_20170626_1516.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.1 on 2017-06-26 15:16
from __future__ import unicode_literals

from django.db import migrations


# Merge migration: joins the two migration branches named in `dependencies`
# into a single history. It performs no schema changes of its own.
class Migration(migrations.Migration):

# Both branch heads must be applied before this migration.
dependencies = [
('share', '0042_merge_20170620_1330'),
('share', '0039_sourcestat'),
]

# Intentionally empty: nothing to alter, only the dependency graph is merged.
operations = [
]
7 changes: 4 additions & 3 deletions share/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from share.models.registration import * # noqa
from share.models.identifiers import * # noqa
from share.models.relations import * # noqa
from share.models.banner import * # noqa
from share.models.ingest import * # noqa
from share.models.logs import * # noqa
from share.models.banner import * # noqa
from share.models.ingest import * # noqa
from share.models.logs import * # noqa
from share.models.sources import * # noqa
from share.models.celery import * # noqa
34 changes: 34 additions & 0 deletions share/models/sources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import logging

from django.db import models

from share.models.ingest import SourceConfig

logger = logging.getLogger(__name__)
__all__ = ('SourceStat',)


# Model holding one recorded set of stats for a SourceConfig: HTTP response
# details, configured values alongside their "*_source" counterparts, match
# flags and a numeric grade.
# NOTE(review): presumably rows are written by the 'share.tasks.source_stats'
# task scheduled in settings — that code is not visible here; confirm.
class SourceStat(models.Model):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a grade or ok field, so we can filter for sources that are having issues?

# Owning config; deleting the SourceConfig deletes its stats (CASCADE).
config = models.ForeignKey(SourceConfig, on_delete=models.CASCADE)
is_deleted = models.BooleanField(default=False)
# Set once, automatically, when the row is first saved.
date_created = models.DateTimeField(auto_now_add=True)
# Response details are all nullable.
response_status_code = models.SmallIntegerField(blank=True, null=True)
response_elapsed_time = models.FloatField(blank=True, null=True)
response_exception = models.TextField(blank=True, null=True)
# "*_config" fields: presumably the values taken from the SourceConfig — TODO confirm.
earliest_datestamp_config = models.DateField(blank=True, null=True)
base_url_config = models.TextField()
admin_note = models.TextField(blank=True)
# Required float with no default; the admin maps 0 / 5 / 10 to F / C / A.
grade = models.FloatField()

# OAI specific
# "*_source" fields: presumably the values reported by the remote source — TODO confirm.
earliest_datestamp_source = models.DateField(blank=True, null=True)
earliest_datestamps_match = models.BooleanField(default=False)

base_url_source = models.TextField(blank=True, null=True)
base_urls_match = models.BooleanField(default=False)

# Debug representation: class name, primary key and the config's label.
def __repr__(self):
return '<{}({}, {})>'.format(self.__class__.__name__, self.pk, self.config.label)

# Human-readable form: "<source long title>: <config label>".
def __str__(self):
return '{}: {}'.format(self.config.source.long_title, self.config.label)
3 changes: 2 additions & 1 deletion share/sources/au.uow/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Digital Commons/Bepress
configs:
- base_url: http://ro.uow.edu.au/do/oai/
disabled: false
earliest_date: null
earliest_date: 2000-01-19T00:00:00Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: au.uow
Expand Down
18 changes: 9 additions & 9 deletions share/sources/be.ghent/source.yaml
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
configs:
- base_url: https://biblio.ugent.be/oai
disabled: true
earliest_date: null
disabled: false
earliest_date: 2016-12-14T15:38:10Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: be.ghent
harvester_kwargs: {metadata_prefix: mods}
label: be.ghent.mods
rate_limit_allowance: 1
rate_limit_period: 2
transformer: oai_dc
transformer: mods
transformer_kwargs:
approved_sets: null
emitted_type: CreativeWork
property_list: []
type_map: {}
- base_url: https://biblio.ugent.be/oai
disabled: false
disabled: true
earliest_date: null
harvester: oai
harvester_kwargs: {metadata_prefix: mods}
label: be.ghent.mods
harvester_kwargs: {metadata_prefix: oai_dc}
label: be.ghent
rate_limit_allowance: 1
rate_limit_period: 2
transformer: mods
transformer: oai_dc
transformer_kwargs:
approved_sets: null
emitted_type: CreativeWork
Expand Down
4 changes: 2 additions & 2 deletions share/sources/br.pcurio/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
configs:
- base_url: http://www.maxwell.vrac.puc-rio.br/DC_Todos.php
- base_url: https://www.maxwell.vrac.puc-rio.br/DC_Todos.php
disabled: false
earliest_date: null
earliest_date: null # earliestDatestamp is earliest published
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc, time_granularity: false}
label: br.pcurio
Expand Down
3 changes: 2 additions & 1 deletion share/sources/ca.umontreal/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# DSpace
configs:
- base_url: http://papyrus.bib.umontreal.ca/oai/request
disabled: false
earliest_date: null
earliest_date: 2005-05-18T18:27:23Z
harvester: oai
harvester_kwargs: {metadata_prefix: mods}
label: ca.umontreal.mods
Expand Down
3 changes: 2 additions & 1 deletion share/sources/ca.uwo/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Digital Commons/Bepress
configs:
- base_url: http://ir.lib.uwo.ca/do/oai/
disabled: false
earliest_date: null
earliest_date: 2000-01-19T00:00:00Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: ca.uwo
Expand Down
4 changes: 2 additions & 2 deletions share/sources/ch.cern/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
configs:
- base_url: http://cdsweb.cern.ch/oai2d/
- base_url: http://cds.cern.ch/oai2d
disabled: false
earliest_date: null
earliest_date: 2003-06-02T08:06:23Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: ch.cern
Expand Down
2 changes: 1 addition & 1 deletion share/sources/com.nature/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
configs:
- base_url: http://www.nature.com/oai/request
disabled: false
earliest_date: null
earliest_date: null # earliestDatestamp is earliest published
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc, time_granularity: false}
label: com.nature
Expand Down
2 changes: 1 addition & 1 deletion share/sources/edu.ageconsearch/source.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
configs:
- base_url: http://ageconsearch.umn.edu/browse-date
disabled: false
disabled: true
earliest_date: null
harvester: edu.ageconsearch
harvester_kwargs: {}
Expand Down
5 changes: 3 additions & 2 deletions share/sources/edu.asu/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Link in <location><url>
configs:
- base_url: http://repository.asu.edu/oai-pmh
- base_url: https://repository.asu.edu/oai-pmh
disabled: false
earliest_date: null
earliest_date: 2011-05-06T20:05:41Z
harvester: oai
harvester_kwargs: {metadata_prefix: mods}
label: edu.asu.mods
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.boise_state/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Digital Commons/Bepress
configs:
- base_url: http://scholarworks.boisestate.edu/do/oai/
disabled: false
earliest_date: null
earliest_date: 2000-01-19T00:00:00Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: edu.boise_state
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.bu.open/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# DSpace
configs:
- base_url: http://open.bu.edu/oai/request
disabled: false
earliest_date: null
earliest_date: 2005-08-12T20:32:45Z
harvester: oai
harvester_kwargs: {metadata_prefix: mods}
label: edu.bu.open
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.calhoun/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# DSpace
configs:
- base_url: http://calhoun.nps.edu/oai/request
disabled: false
earliest_date: null
earliest_date: 2012-03-14T16:52:21Z
harvester: oai
harvester_kwargs: {metadata_prefix: mods}
label: edu.calhoun.mods
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.calpoly/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Digital Commons/Bepress
configs:
- base_url: http://digitalcommons.calpoly.edu/do/oai/
disabled: false
earliest_date: null
earliest_date: 2000-01-19T00:00:00Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: edu.calpoly
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.caltech/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# links in <relation>
configs:
- base_url: http://authors.library.caltech.edu/cgi/oai2
disabled: false
earliest_date: null
earliest_date: 2011-01-12T00:11:25Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: edu.caltech
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.chapman/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Digital Commons/Bepress
configs:
- base_url: http://digitalcommons.chapman.edu/do/oai/
disabled: false
earliest_date: null
earliest_date: 2000-01-19T00:00:00Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: edu.chapman
Expand Down
2 changes: 1 addition & 1 deletion share/sources/edu.citeseerx/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
configs:
- base_url: http://citeseerx.ist.psu.edu/oai2
disabled: false
earliest_date: null
earliest_date: null # earliestDatestamp is earliest published
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc, time_granularity: false}
label: edu.citeseerx
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.cmu/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Digital Commons/Bepress
configs:
- base_url: http://repository.cmu.edu/do/oai/
disabled: false
earliest_date: null
earliest_date: 2000-01-19T00:00:00Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: edu.cmu
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.colostate/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# DSpace
configs:
- base_url: https://dspace.library.colostate.edu/oai/request
disabled: false
earliest_date: null
earliest_date: 2007-01-01T06:33:28Z
harvester: oai
harvester_kwargs: {metadata_prefix: mods}
label: edu.colostate.mods
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.cornell/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# DSpace
configs:
- base_url: https://ecommons.cornell.edu/dspace-oai/request
disabled: false
earliest_date: null
earliest_date: 2002-11-12T17:55:14Z
harvester: oai
harvester_kwargs: {metadata_prefix: mods}
label: edu.cornell.mods
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.csuohio/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Digital Commons/Bepress
configs:
- base_url: http://engagedscholarship.csuohio.edu/do/oai/
disabled: false
earliest_date: null
earliest_date: 2000-01-19T00:00:00Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: edu.csuohio
Expand Down
3 changes: 2 additions & 1 deletion share/sources/edu.cuny/source.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Digital Commons/Bepress
configs:
- base_url: http://academicworks.cuny.edu/do/oai/
disabled: false
earliest_date: null
earliest_date: 2000-01-19T00:00:00Z
harvester: oai
harvester_kwargs: {metadata_prefix: oai_dc}
label: edu.cuny
Expand Down
Loading