From f9faff289e2b13b8ca9c9ab69627c3297f2121c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20M=C3=A9nager?= Date: Fri, 17 Jun 2016 14:13:00 +0200 Subject: [PATCH 1/4] ignore pycharm files --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index ef302c113..523c0d55b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ eggs/ *~ \#*\# .desktop +.idea + From b9d01d16157fa21e2ac04e240907b96aeb5e6ec9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20M=C3=A9nager?= Date: Fri, 17 Jun 2016 14:15:02 +0200 Subject: [PATCH 2/4] first take at providing a cache mechanism for imported schemas This adds a --cache-net-dir option to the tool, which is a path to a cache directory that will be used to store the requested schemas on disk if specified. It works by adding a handler to the urllib2 library that handles the opening of HTTP URLs and first checks whether they are stored in the directory: - if present, the contents are read directly from the stored file - if not present, the HTTP request is actually issued and the contents are stored on file as well as sent back to the user. 
--- cwltool/main.py | 7 ++++- cwltool/net_cache.py | 63 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 cwltool/net_cache.py diff --git a/cwltool/main.py b/cwltool/main.py index fc0e814d0..62fd7cbb9 100755 --- a/cwltool/main.py +++ b/cwltool/main.py @@ -15,6 +15,7 @@ from .cwlrdf import printrdf, printdot from .process import shortname, Process from .load_tool import fetch_document, validate_document, make_tool +from .net_cache import set_cache import schema_salad.validate as validate import tempfile import schema_salad.jsonld_context @@ -148,6 +149,9 @@ def arg_parser(): # type: () -> argparse.ArgumentParser parser.add_argument("workflow", type=str, nargs="?", default=None) parser.add_argument("job_order", nargs=argparse.REMAINDER) + exgroup.add_argument("--cache-net-dir", type=str, default=None, + help="Cache directory to use for HTTP resources (e.g., schemas).") + return parser @@ -569,12 +573,13 @@ def main(argsl=None, return 1 try: + if args.cache_net_dir is not None: + set_cache(args.cache_net_dir) document_loader, workflowobj, uri = fetch_document(args.workflow) if args.print_deps: printdeps(workflowobj, document_loader, stdout, args.relative_deps) return 0 - document_loader, avsc_names, processobj, metadata, uri \ = validate_document(document_loader, workflowobj, uri, enable_dev=args.enable_dev, strict=args.strict, diff --git a/cwltool/net_cache.py b/cwltool/net_cache.py new file mode 100644 index 000000000..35e0d3cc0 --- /dev/null +++ b/cwltool/net_cache.py @@ -0,0 +1,63 @@ +import urllib2 +import hashlib +import urlparse, urllib +import os + +cache_folder = None + +def path2url(path): + return urlparse.urljoin( + 'file:', urllib.pathname2url(path)) + +def set_cache(new_cache_folder): + global cache_folder + cached_handler = CachedHTTPHandler() + opener = urllib2.build_opener(cached_handler) + urllib2.install_opener(opener) + cache_folder = new_cache_folder + if 
not(os.path.exists(cache_folder)): + os.mkdir(cache_folder) + + +class CachedHTTPHandler(urllib2.AbstractHTTPHandler): + + handler_order = 100 + + def http_open(self, req): + cache_path = self._get_local_cachefile_path(req.get_full_url()) + cache_url = self._get_local_cachefile_url(req.get_full_url()) + if os.path.exists(cache_path+'_redirect'): + redirected_url = open(cache_path+'_redirect','r').read() + cache_path = self._get_local_cachefile_path(redirected_url) + cache_url = self._get_local_cachefile_url(redirected_url) + if not(os.path.exists(cache_path)): + response = urllib2.HTTPHandler().http_open(req) + code = response.code + headers = response.headers + if code==200: + open(cache_path, 'w').write(response.read()) + else: + if (code in (301, 302, 303, 307)): + if 'location' in headers: + newurl = headers.getheaders('location')[0] + elif 'uri' in headers: + newurl = headers.getheaders('uri')[0] + open(cache_path+'_redirect', 'w').write(newurl) + print newurl + #os.link(cache_path, self._get_local_cachefile_path(newurl)) + return response + response = urllib2.FileHandler().file_open(urllib2.Request(cache_url)) + response.code = 200 + response.msg = "everything is ok" + return response + + http_request = urllib2.AbstractHTTPHandler.do_request_ + + def _get_local_cachefile_name(self, url): + return hashlib.md5(url).hexdigest() + + def _get_local_cachefile_path(self, url): + return os.path.abspath(os.path.join(cache_folder, self._get_local_cachefile_name(url))) + + def _get_local_cachefile_url(self, url): + return path2url(self._get_local_cachefile_path(url)) \ No newline at end of file From 678c6f81d82a9b00ae4f0325533d9a95350bf916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20M=C3=A9nager?= Date: Fri, 17 Jun 2016 14:25:11 +0200 Subject: [PATCH 3/4] rename net_cache to urllib2_cache which is more descriptive. 
--- cwltool/main.py | 2 +- cwltool/{net_cache.py => urllib2_cache.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cwltool/{net_cache.py => urllib2_cache.py} (100%) diff --git a/cwltool/main.py b/cwltool/main.py index 62fd7cbb9..dded1541c 100755 --- a/cwltool/main.py +++ b/cwltool/main.py @@ -15,7 +15,7 @@ from .cwlrdf import printrdf, printdot from .process import shortname, Process from .load_tool import fetch_document, validate_document, make_tool -from .net_cache import set_cache +from .urllib2_cache import set_cache import schema_salad.validate as validate import tempfile import schema_salad.jsonld_context diff --git a/cwltool/net_cache.py b/cwltool/urllib2_cache.py similarity index 100% rename from cwltool/net_cache.py rename to cwltool/urllib2_cache.py From 28a2266c90a047195990c2584ceab1b0bdaaf38a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20M=C3=A9nager?= Date: Fri, 17 Jun 2016 16:17:21 +0200 Subject: [PATCH 4/4] typo correction in error message --- cwltool/process.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cwltool/process.py b/cwltool/process.py index e742db45f..e5775438f 100644 --- a/cwltool/process.py +++ b/cwltool/process.py @@ -288,13 +288,13 @@ def __init__(self, toolpath_object, **kwargs): self.inputs_record_schema = schema_salad.schema.make_valid_avro(self.inputs_record_schema, {}, set()) avro.schema.make_avsc_object(self.inputs_record_schema, self.names) except avro.schema.SchemaParseException as e: - raise validate.ValidationException(u"Got error `%s` while prcoessing inputs of %s:\n%s" % (str(e), self.tool["id"], json.dumps(self.inputs_record_schema, indent=4))) + raise validate.ValidationException(u"Got error `%s` while processing inputs of %s:\n%s" % (str(e), self.tool["id"], json.dumps(self.inputs_record_schema, indent=4))) try: self.outputs_record_schema = schema_salad.schema.make_valid_avro(self.outputs_record_schema, {}, set()) avro.schema.make_avsc_object(self.outputs_record_schema, 
self.names) except avro.schema.SchemaParseException as e: - raise validate.ValidationException(u"Got error `%s` while prcoessing outputs of %s:\n%s" % (str(e), self.tool["id"], json.dumps(self.outputs_record_schema, indent=4))) + raise validate.ValidationException(u"Got error `%s` while processing outputs of %s:\n%s" % (str(e), self.tool["id"], json.dumps(self.outputs_record_schema, indent=4))) def _init_job(self, joborder, **kwargs):