adds a --cache-net-dir option to the tool to store requested schemas on disk if specified #100
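In practice the new flag is passed on the command line before the workflow; a usage sketch (the cache directory and the workflow/job-order file names are only illustrative):

    cwltool --cache-net-dir ~/.cwltool-cache workflow.cwl job.json

On the first run, every schema fetched over HTTP is written into ~/.cwltool-cache; later runs read the copies from disk instead of the network.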

Closed
wants to merge 4 commits
Changes from all commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -14,3 +14,5 @@ eggs/
*~
\#*\#
.desktop
.idea

7 changes: 6 additions & 1 deletion cwltool/main.py
@@ -15,6 +15,7 @@
from .cwlrdf import printrdf, printdot
from .process import shortname, Process
from .load_tool import fetch_document, validate_document, make_tool
from .urllib2_cache import set_cache
import schema_salad.validate as validate
import tempfile
import schema_salad.jsonld_context
@@ -148,6 +149,9 @@ def arg_parser(): # type: () -> argparse.ArgumentParser
parser.add_argument("workflow", type=str, nargs="?", default=None)
parser.add_argument("job_order", nargs=argparse.REMAINDER)

exgroup.add_argument("--cache-net-dir", type=str, default=None,
help="Cache directory to use for HTTP resources (e.g., schemas).")

return parser


@@ -569,12 +573,13 @@ def main(argsl=None,
        return 1

    try:
        if args.cache_net_dir is not None:
            set_cache(args.cache_net_dir)
        document_loader, workflowobj, uri = fetch_document(args.workflow)

        if args.print_deps:
            printdeps(workflowobj, document_loader, stdout, args.relative_deps)
            return 0

        document_loader, avsc_names, processobj, metadata, uri \
            = validate_document(document_loader, workflowobj, uri,
                                enable_dev=args.enable_dev, strict=args.strict,
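Note that set_cache() must run before fetch_document(): urllib2.install_opener() replaces the process-wide opener, so only requests issued after that call go through the cache. A minimal sketch of the same wiring outside cwltool (the cache directory and URL are illustrative, not from the patch):

    import urllib2
    from cwltool.urllib2_cache import set_cache

    set_cache("/tmp/cwl-schema-cache")  # installs the caching opener process-wide
    # every urllib2.urlopen() call from here on is routed through the cache
    body = urllib2.urlopen("http://example.com/schema.yml").read()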
4 changes: 2 additions & 2 deletions cwltool/process.py
@@ -288,13 +288,13 @@ def __init__(self, toolpath_object, **kwargs):
            self.inputs_record_schema = schema_salad.schema.make_valid_avro(self.inputs_record_schema, {}, set())
            avro.schema.make_avsc_object(self.inputs_record_schema, self.names)
        except avro.schema.SchemaParseException as e:
            raise validate.ValidationException(u"Got error `%s` while prcoessing inputs of %s:\n%s" % (str(e), self.tool["id"], json.dumps(self.inputs_record_schema, indent=4)))
            raise validate.ValidationException(u"Got error `%s` while processing inputs of %s:\n%s" % (str(e), self.tool["id"], json.dumps(self.inputs_record_schema, indent=4)))

        try:
            self.outputs_record_schema = schema_salad.schema.make_valid_avro(self.outputs_record_schema, {}, set())
            avro.schema.make_avsc_object(self.outputs_record_schema, self.names)
        except avro.schema.SchemaParseException as e:
            raise validate.ValidationException(u"Got error `%s` while prcoessing outputs of %s:\n%s" % (str(e), self.tool["id"], json.dumps(self.outputs_record_schema, indent=4)))
            raise validate.ValidationException(u"Got error `%s` while processing outputs of %s:\n%s" % (str(e), self.tool["id"], json.dumps(self.outputs_record_schema, indent=4)))


    def _init_job(self, joborder, **kwargs):
63 changes: 63 additions & 0 deletions cwltool/urllib2_cache.py
@@ -0,0 +1,63 @@
"""Transparent on-disk caching of HTTP resources fetched through urllib2 (Python 2)."""
import hashlib
import os
import urllib
import urllib2
import urlparse

cache_folder = None


def path2url(path):
    """Convert a local filesystem path to a file:// URL."""
    return urlparse.urljoin('file:', urllib.pathname2url(path))


def set_cache(new_cache_folder):
    """Install a global urllib2 opener that caches HTTP responses in new_cache_folder."""
    global cache_folder
    cached_handler = CachedHTTPHandler()
    opener = urllib2.build_opener(cached_handler)
    urllib2.install_opener(opener)
    cache_folder = new_cache_folder
    if not os.path.exists(cache_folder):
        os.mkdir(cache_folder)


class CachedHTTPHandler(urllib2.AbstractHTTPHandler):

    handler_order = 100  # run before the default HTTPHandler (order 500)

    def http_open(self, req):
        cache_path = self._get_local_cachefile_path(req.get_full_url())
        cache_url = self._get_local_cachefile_url(req.get_full_url())
        # If this URL was redirected on an earlier run, use the cache entry
        # of the redirect target instead.
        if os.path.exists(cache_path + '_redirect'):
            with open(cache_path + '_redirect', 'r') as f:
                redirected_url = f.read()
            cache_path = self._get_local_cachefile_path(redirected_url)
            cache_url = self._get_local_cachefile_url(redirected_url)
        if not os.path.exists(cache_path):
            response = urllib2.HTTPHandler().http_open(req)
            code = response.code
            headers = response.headers
            if code == 200:
                with open(cache_path, 'w') as f:
                    f.write(response.read())
            else:
                if code in (301, 302, 303, 307):
                    # Record the redirect target so that future requests for
                    # the original URL hit the target's cache entry.
                    if 'location' in headers:
                        newurl = headers.getheaders('location')[0]
                    elif 'uri' in headers:
                        newurl = headers.getheaders('uri')[0]
                    else:
                        return response  # malformed redirect; pass it through
                    with open(cache_path + '_redirect', 'w') as f:
                        f.write(newurl)
                # Redirects and errors are handed back to urllib2 unchanged so
                # its normal redirect/error handling applies.
                return response
        # Serve the cached copy through the file: protocol; fake a 200 so the
        # HTTPErrorProcessor accepts the response.
        response = urllib2.FileHandler().file_open(urllib2.Request(cache_url))
        response.code = 200
        response.msg = "OK"
        return response

    http_request = urllib2.AbstractHTTPHandler.do_request_

    def _get_local_cachefile_name(self, url):
        return hashlib.md5(url).hexdigest()

    def _get_local_cachefile_path(self, url):
        return os.path.abspath(os.path.join(cache_folder, self._get_local_cachefile_name(url)))

    def _get_local_cachefile_url(self, url):
        return path2url(self._get_local_cachefile_path(url))
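
A quick way to exercise the handler on its own (Python 2; the directory and URL are illustrative). The second fetch is served from disk, and cache entries are named by the MD5 hex digest of the URL:

    import hashlib, os, urllib2
    from cwltool.urllib2_cache import set_cache

    set_cache("/tmp/url-cache")
    url = "http://example.com/"
    first = urllib2.urlopen(url).read()   # network fetch; body written to the cache
    second = urllib2.urlopen(url).read()  # served from the on-disk copy
    assert first == second
    assert hashlib.md5(url).hexdigest() in os.listdir("/tmp/url-cache")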