From 4600bbd5235a3204dc5f3bf5d497fc1f0a05578d Mon Sep 17 00:00:00 2001 From: kapilkd13 Date: Mon, 31 Jul 2017 18:44:36 +0530 Subject: [PATCH 1/8] avoid basedir append to http/https input file path --- cwltool/pathmapper.py | 11 ++++++----- cwltool/stdfsaccess.py | 2 ++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cwltool/pathmapper.py b/cwltool/pathmapper.py index 6802a91eb..377278abc 100644 --- a/cwltool/pathmapper.py +++ b/cwltool/pathmapper.py @@ -210,12 +210,13 @@ def visit(self, obj, stagedir, basedir, copy=False, staged=False): with SourceLine(obj, "location", validate.ValidationException): # Dereference symbolic links deref = ab - st = os.lstat(deref) - while stat.S_ISLNK(st.st_mode): - rl = os.readlink(deref) - deref = rl if os.path.isabs(rl) else os.path.join( - os.path.dirname(deref), rl) + if urllib.parse.urlsplit(deref).scheme not in ['http','https']: st = os.lstat(deref) + while stat.S_ISLNK(st.st_mode): + rl = os.readlink(deref) + deref = rl if os.path.isabs(rl) else os.path.join( + os.path.dirname(deref), rl) + st = os.lstat(deref) self._pathmap[path] = MapperEnt(deref, tgt, "WritableFile" if copy else "File", staged) self.visitlisting(obj.get("secondaryFiles", []), stagedir, basedir, copy=copy, staged=staged) diff --git a/cwltool/stdfsaccess.py b/cwltool/stdfsaccess.py index df5056b04..72016a861 100644 --- a/cwltool/stdfsaccess.py +++ b/cwltool/stdfsaccess.py @@ -13,6 +13,8 @@ def abspath(src, basedir): # type: (Text, Text) -> Text if src.startswith(u"file://"): ab = six.text_type(uri_file_path(str(src))) + elif urllib.parse.urlsplit(src).scheme in ['http','https']: + return src else: if basedir.startswith(u"file://"): ab = src if os.path.isabs(src) else basedir+ '/'+ src From 95098e31fb2c70f7704a61b1505a3f078b0a16db Mon Sep 17 00:00:00 2001 From: kapilkd13 Date: Fri, 4 Aug 2017 17:12:42 +0530 Subject: [PATCH 2/8] adding function to Download http/s files as input --- cwltool/pathmapper.py | 19 +++++++++++++++++-- 1 file changed, 17 
insertions(+), 2 deletions(-) diff --git a/cwltool/pathmapper.py b/cwltool/pathmapper.py index 377278abc..47618003c 100644 --- a/cwltool/pathmapper.py +++ b/cwltool/pathmapper.py @@ -5,6 +5,9 @@ import stat import uuid from functools import partial +from tempfile import NamedTemporaryFile + +import requests from typing import Any, Callable, Dict, Iterable, List, Set, Text, Tuple, Union import schema_salad.validate as validate @@ -139,6 +142,15 @@ def trim_listing(obj): if obj.get("location", "").startswith("file://") and "listing" in obj: del obj["listing"] +# Download http Files +def downloadHttpFile(httpurl): + r = requests.get(httpurl, stream=True) + with NamedTemporaryFile(mode='wb', delete=False) as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + r.close() + return f.name class PathMapper(object): """Mapping of files from relative path provided in the file to a tuple of @@ -208,15 +220,18 @@ def visit(self, obj, stagedir, basedir, copy=False, staged=False): self._pathmap[obj["location"]] = MapperEnt(obj["contents"], tgt, "CreateFile", staged) else: with SourceLine(obj, "location", validate.ValidationException): - # Dereference symbolic links deref = ab - if urllib.parse.urlsplit(deref).scheme not in ['http','https']: + if urllib.parse.urlsplit(deref).scheme in ['http','https']: + deref = downloadHttpFile(path) + else: + # Dereference symbolic links st = os.lstat(deref) while stat.S_ISLNK(st.st_mode): rl = os.readlink(deref) deref = rl if os.path.isabs(rl) else os.path.join( os.path.dirname(deref), rl) st = os.lstat(deref) + self._pathmap[path] = MapperEnt(deref, tgt, "WritableFile" if copy else "File", staged) self.visitlisting(obj.get("secondaryFiles", []), stagedir, basedir, copy=copy, staged=staged) From 0ae320934c04f3e62da4a6873d411b2f9b71db96 Mon Sep 17 00:00:00 2001 From: kapilkd13 Date: Fri, 4 Aug 2017 20:11:09 +0530 Subject: [PATCH 3/8] changing chunksize to 16 kb --- 
cwltool/pathmapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cwltool/pathmapper.py b/cwltool/pathmapper.py index 47618003c..2bda574f3 100644 --- a/cwltool/pathmapper.py +++ b/cwltool/pathmapper.py @@ -146,7 +146,7 @@ def trim_listing(obj): def downloadHttpFile(httpurl): r = requests.get(httpurl, stream=True) with NamedTemporaryFile(mode='wb', delete=False) as f: - for chunk in r.iter_content(chunk_size=1024): + for chunk in r.iter_content(chunk_size=16384): if chunk: # filter out keep-alive new chunks f.write(chunk) r.close() From 218f8b58cdc112b69dd671dbd050d3d1ffcb2aa9 Mon Sep 17 00:00:00 2001 From: kapilkd13 Date: Sun, 6 Aug 2017 14:58:40 +0530 Subject: [PATCH 4/8] Using cache-control to cache downloaded input files --- cwltool/pathmapper.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/cwltool/pathmapper.py b/cwltool/pathmapper.py index 2bda574f3..e3f91d36c 100644 --- a/cwltool/pathmapper.py +++ b/cwltool/pathmapper.py @@ -8,6 +8,8 @@ from tempfile import NamedTemporaryFile import requests +from cachecontrol import CacheControl +from cachecontrol.caches import FileCache from typing import Any, Callable, Dict, Iterable, List, Set, Text, Tuple, Union import schema_salad.validate as validate @@ -144,13 +146,28 @@ def trim_listing(obj): # Download http Files def downloadHttpFile(httpurl): - r = requests.get(httpurl, stream=True) - with NamedTemporaryFile(mode='wb', delete=False) as f: - for chunk in r.iter_content(chunk_size=16384): - if chunk: # filter out keep-alive new chunks - f.write(chunk) - r.close() - return f.name + cache_session = None + if "HOME" in os.environ: + cache_session = CacheControl( + requests.Session(), + cache=FileCache( + os.path.join(os.environ["HOME"], ".cache", "cwltool"))) + elif "TMP" in os.environ: + cache_session = CacheControl( + requests.Session(), + cache=FileCache(os.path.join(os.environ["TMP"], ".cache", "cwltool"))) + else: + cache_session = 
CacheControl( + requests.Session(), + cache=FileCache("/tmp", ".cache", "cwltool")) + + r = cache_session.get(httpurl, stream=True) + with NamedTemporaryFile(mode='wb', delete=False) as f: + for chunk in r.iter_content(chunk_size=16384): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + r.close() + return f.name class PathMapper(object): """Mapping of files from relative path provided in the file to a tuple of From 04e0484ff875a08e66112bc04abca103979531fb Mon Sep 17 00:00:00 2001 From: kapilkd13 Date: Sun, 6 Aug 2017 23:57:13 +0530 Subject: [PATCH 5/8] using XDG standard env variable for caching --- cwltool/pathmapper.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cwltool/pathmapper.py b/cwltool/pathmapper.py index e3f91d36c..63b61dc07 100644 --- a/cwltool/pathmapper.py +++ b/cwltool/pathmapper.py @@ -147,19 +147,21 @@ def trim_listing(obj): # Download http Files def downloadHttpFile(httpurl): cache_session = None - if "HOME" in os.environ: + if "XDG_CACHE_HOME" in os.environ: cache_session = CacheControl( requests.Session(), cache=FileCache( - os.path.join(os.environ["HOME"], ".cache", "cwltool"))) - elif "TMP" in os.environ: + os.path.join(os.environ["XDG_CACHE_HOME"], "cwltool"))) + elif "HOME" in os.environ: cache_session = CacheControl( requests.Session(), - cache=FileCache(os.path.join(os.environ["TMP"], ".cache", "cwltool"))) + cache=FileCache( + os.path.join(os.environ["HOME"], ".cache", "cwltool"))) else: cache_session = CacheControl( requests.Session(), - cache=FileCache("/tmp", ".cache", "cwltool")) + cache=FileCache( + os.path.join(os.path.expanduser('~'), ".cache", "cwltool"))) r = cache_session.get(httpurl, stream=True) with NamedTemporaryFile(mode='wb', delete=False) as f: From 6a4d1f04a3ed9a939525f5c46a0fd367f8a0ae26 Mon Sep 17 00:00:00 2001 From: kapilkd13 Date: Wed, 9 Aug 2017 22:56:10 +0530 Subject: [PATCH 6/8] adding test to check that input file is loaded over http/s ---
tests/test_http_input.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/test_http_input.py diff --git a/tests/test_http_input.py b/tests/test_http_input.py new file mode 100644 index 000000000..e3a298190 --- /dev/null +++ b/tests/test_http_input.py @@ -0,0 +1,26 @@ +from __future__ import absolute_import +import unittest +import os +import tempfile +from cwltool.pathmapper import PathMapper + + +class TestHttpInput(unittest.TestCase): + def test_http_path_mapping(self): + class SubPathMapper(PathMapper): + def __init__(self, referenced_files, basedir, stagedir): + super(SubPathMapper, self).__init__(referenced_files, basedir, stagedir) + input_file_path = "https://raw.githubusercontent.com/common-workflow-language/cwltool/master/tests/2.fasta" + tempdir = tempfile.mkdtemp() + base_file = [{ + "class": "File", + "location": "https://raw.githubusercontent.com/common-workflow-language/cwltool/master/tests/2.fasta", + "basename": "chr20.fa" + }] + path_map_obj = SubPathMapper(base_file, os.getcwd(), tempdir) + + self.assertIn(input_file_path,path_map_obj._pathmap) + assert os.path.exists(path_map_obj._pathmap[input_file_path].resolved) == 1 + with open(path_map_obj._pathmap[input_file_path].resolved) as f: + self.assertIn(">Sequence 561 BP; 135 A; 106 C; 98 G; 222 T; 0 other;",f.read()) + f.close() \ No newline at end of file From be521202503574d02271fb5f02ce305f909a89d0 Mon Sep 17 00:00:00 2001 From: kapilkd13 Date: Thu, 10 Aug 2017 00:18:58 +0530 Subject: [PATCH 7/8] adding type to downloadhttp function --- cwltool/pathmapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cwltool/pathmapper.py b/cwltool/pathmapper.py index 63b61dc07..f3c5fa64f 100644 --- a/cwltool/pathmapper.py +++ b/cwltool/pathmapper.py @@ -146,6 +146,7 @@ def trim_listing(obj): # Download http Files def downloadHttpFile(httpurl): + # type: (Text) -> Text cache_session = None if "XDG_CACHE_HOME" in os.environ: cache_session = CacheControl( From 
7cac3a4b8b0eb3108ccaa2841d351afacd615c77 Mon Sep 17 00:00:00 2001 From: kapilkd13 Date: Thu, 10 Aug 2017 20:59:57 +0530 Subject: [PATCH 8/8] cleanup: removing repetitive code lines --- cwltool/pathmapper.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/cwltool/pathmapper.py b/cwltool/pathmapper.py index f3c5fa64f..dd7c09ae7 100644 --- a/cwltool/pathmapper.py +++ b/cwltool/pathmapper.py @@ -149,20 +149,16 @@ def downloadHttpFile(httpurl): # type: (Text) -> Text cache_session = None if "XDG_CACHE_HOME" in os.environ: - cache_session = CacheControl( - requests.Session(), - cache=FileCache( - os.path.join(os.environ["XDG_CACHE_HOME"], "cwltool"))) + directory = os.environ["XDG_CACHE_HOME"] elif "HOME" in os.environ: - cache_session = CacheControl( - requests.Session(), - cache=FileCache( - os.path.join(os.environ["HOME"], ".cache", "cwltool"))) + directory = os.environ["HOME"] else: - cache_session = CacheControl( - requests.Session(), - cache=FileCache( - os.path.join(os.path.expanduser('~'), ".cache", "cwltool"))) + directory = os.path.expanduser('~') + + cache_session = CacheControl( + requests.Session(), + cache=FileCache( + os.path.join(directory, ".cache", "cwltool"))) r = cache_session.get(httpurl, stream=True) with NamedTemporaryFile(mode='wb', delete=False) as f: