From a5dd1ed1d8a81da8f546bf21bd2cfb1bcd2785eb Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 27 Apr 2015 16:12:37 +0100 Subject: [PATCH 1/4] Add support for S3 pseudo-URLs Rewrite S3 pseudo-URLs to http/https URLs, adding Date and Authorization headers for Amazon S3. At present, access keys may be specified in the URL (in the usual URL authority "[id:secret@]bucket" way) or via the usual AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables. It remains to add code to read them from config files -- probably just ~/.aws/credentials and ~/.awssecret. --- config.h.in | 9 +++ configure.ac | 10 +++ hfile.c | 3 + hfile_libcurl.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 209 insertions(+), 1 deletion(-) diff --git a/config.h.in b/config.h.in index 1ec9b6afe..c3862201f 100644 --- a/config.h.in +++ b/config.h.in @@ -5,9 +5,18 @@ This template file can be updated with autoheader, but do so carefully as autoheader adds #defines such as PACKAGE_* that we don't want. */ +/* Define if you have the Common Crypto library. */ +#undef HAVE_COMMONCRYPTO + +/* Define to 1 if you have the `gmtime_r' function. */ +#undef HAVE_GMTIME_R + /* Define to 1 if iRODS file access is enabled. */ #undef HAVE_IRODS +/* Define to 1 if you have the `crypto' library (-lcrypto). */ +#undef HAVE_LIBCRYPTO + /* Define if libcurl file access is enabled. */ #undef HAVE_LIBCURL diff --git a/configure.ac b/configure.ac index 4fce8a944..fb3754cf2 100644 --- a/configure.ac +++ b/configure.ac @@ -38,6 +38,8 @@ redistribute it. There is NO WARRANTY, to the extent permitted by law.]) AC_PROG_CC AC_PROG_RANLIB +need_crypto=no + AC_ARG_WITH([irods], [AS_HELP_STRING([[--with-irods[=DIR]]], [use RodsAPIs library (in DIR) to support iRODS URLs])], @@ -55,6 +57,7 @@ AC_ARG_ENABLE([libcurl], dnl FIXME This pulls in dozens of standard header checks AC_FUNC_MMAP +AC_CHECK_FUNCS(gmtime_r) save_LIBS=$LIBS zlib_devel=ok @@ -121,8 +124,15 @@ is installed. 
Either configure with --disable-libcurl or resolve this error to build HTSlib.]) ;; esac]) + need_crypto=yes fi AC_SUBST([libcurl]) +if test $need_crypto != no; then + AC_CHECK_LIB([crypto], [HMAC]) + AC_CHECK_FUNC([CCHmac], [AC_DEFINE([HAVE_COMMONCRYPTO], 1, + [Define if you have the Common Crypto library.])]) +fi + AC_CONFIG_FILES(config.mk) AC_OUTPUT diff --git a/hfile.c b/hfile.c index 476b5939c..160aea02e 100644 --- a/hfile.c +++ b/hfile.c @@ -552,6 +552,9 @@ hFILE *hopen(const char *fname, const char *mode) #ifdef HAVE_LIBCURL else if (strncmp(fname, "http://", 7) == 0 || strncmp(fname, "https://", 8) == 0 || + strncmp(fname, "s3://", 5) == 0 || + strncmp(fname, "s3+http://", 10) == 0 || + strncmp(fname, "s3+https://", 11) == 0 || strncmp(fname, "ftp://", 6) == 0) return hopen_libcurl(fname,mode); #endif else if (strncmp(fname, "http://", 7) == 0 || diff --git a/hfile_libcurl.c b/hfile_libcurl.c index 9af702244..946f6ce0b 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -24,12 +24,15 @@ DEALINGS IN THE SOFTWARE. */ #include +#include #include #include +#include #include #include #include "hfile_internal.h" +#include "htslib/kstring.h" #include @@ -465,6 +468,9 @@ static int add_header(hFILE_libcurl *fp, const char *header) return 0; } +static int +add_s3_settings(hFILE_libcurl *fp, const char *url, kstring_t *message); + hFILE *hopen_libcurl(const char *url, const char *modes) { hFILE_libcurl *fp; @@ -512,7 +518,17 @@ hFILE *hopen_libcurl(const char *url, const char *modes) if (add_header(fp, "Transfer-Encoding: chunked") < 0) goto error; } - err |= curl_easy_setopt(fp->easy, CURLOPT_URL, url); + if (tolower(url[0]) == 's' && url[1] == '3') { + // Construct the HTTP-Method/Content-MD5/Content-Type part of the + // message to be signed. This will be destroyed by add_s3_settings(). + kstring_t message = { 0, 0, NULL }; + kputs((mode == 'r')? 
"GET\n" : "PUT\n", &message); + kputc('\n', &message); + kputc('\n', &message); + if (add_s3_settings(fp, url, &message) < 0) goto error; + } + else + err |= curl_easy_setopt(fp->easy, CURLOPT_URL, url); if (fp->headers) err |= curl_easy_setopt(fp->easy, CURLOPT_HTTPHEADER, fp->headers); @@ -557,3 +573,173 @@ hFILE *hopen_libcurl(const char *url, const char *modes) errno = save; return NULL; } + + +/******************* + * Rewrite S3 URLs * + *******************/ + +#if defined HAVE_COMMONCRYPTO + +#include + +#define DIGEST_BUFSIZ CC_SHA1_DIGEST_LENGTH + +static size_t +s3_sign(unsigned char *digest, kstring_t *key, kstring_t *message) +{ + CCHmac(kCCHmacAlgSHA1, key->s, key->l, message->s, message->l, digest); + return CC_SHA1_DIGEST_LENGTH; +} + +#elif defined HAVE_LIBCRYPTO + +#include + +#define DIGEST_BUFSIZ EVP_MAX_MD_SIZE + +static size_t +s3_sign(unsigned char *digest, kstring_t *key, kstring_t *message) +{ + unsigned int len; + HMAC(EVP_sha1(), key->s, key->l, + (unsigned char *) message->s, message->l, digest, &len); + return len; +} + +#endif + +static void +urldecode_kput(const char *s, int len, hFILE_libcurl *fp, kstring_t *str) +{ + if (memchr(s, '%', len) != NULL) { + int len2; + char *s2 = curl_easy_unescape(fp->easy, s, len, &len2); + if (s2 == NULL) abort(); + kputsn(s2, len2, str); + curl_free(s2); + } + else kputsn(s, len, str); +} + +static void base64_kput(const unsigned char *data, size_t len, kstring_t *str) +{ + static const char base64[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + + size_t i = 0; + unsigned x = 0; + int bits = 0, pad = 0; + + while (bits || i < len) { + if (bits < 6) { + x <<= 8, bits += 8; + if (i < len) x |= data[i++]; + else pad++; + } + + bits -= 6; + kputc(base64[(x >> bits) & 63], str); + } + + str->l -= pad; + kputsn("==", pad, str); +} + +static int +add_s3_settings(hFILE_libcurl *fp, const char *s3url, kstring_t *message) +{ + int ret, save; + const char *bucket, *path; + char 
date_hdr[40]; + CURLcode err; + + kstring_t url = { 0, 0, NULL }; + kstring_t id = { 0, 0, NULL }; + kstring_t secret = { 0, 0, NULL }; + kstring_t auth_hdr = { 0, 0, NULL }; + + time_t now = time(NULL); +#ifdef HAVE_GMTIME_R + struct tm tm_buffer; + struct tm *tm = gmtime_r(&now, &tm_buffer); +#else + struct tm *tm = gmtime(&now); +#endif + + strftime(date_hdr, sizeof date_hdr, "Date: %a, %d %b %Y %H:%M:%S GMT", tm); + if (add_header(fp, date_hdr) < 0) goto error; + kputs(&date_hdr[6], message); + kputc('\n', message); + + // Our S3 URL format is s3[+SCHEME]://[ID[:SECRET]@]BUCKET/PATH + + if (s3url[2] == '+') { + bucket = strchr(s3url, ':') + 1; + kputsn(&s3url[3], bucket - &s3url[3], &url); + } + else { + kputs("https:", &url); + bucket = &s3url[3]; + } + while (*bucket == '/') kputc(*bucket++, &url); + + path = bucket + strcspn(bucket, "/?#@"); + if (*path == '@') { + const char *colon = bucket + strcspn(bucket, ":@"); + urldecode_kput(bucket, colon - bucket, fp, &id); + if (*colon == ':') + urldecode_kput(&colon[1], path - &colon[1], fp, &secret); + + bucket = &path[1]; + path = bucket + strcspn(bucket, "/?#"); + } + else { + // If the URL has no ID[:SECRET]@, consider environment variables. + const char *v; + if ((v = getenv("AWS_ACCESS_KEY_ID")) != NULL) kputs(v, &id); + if ((v = getenv("AWS_SECRET_ACCESS_KEY")) != NULL) kputs(v, &secret); + } + + kputsn(bucket, path - bucket, &url); + kputs(".s3.amazonaws.com", &url); + kputs(path, &url); + + kputc('/', message); + kputs(bucket, message); // CanonicalizedResource is '/' + bucket + path + + err = curl_easy_setopt(fp->easy, CURLOPT_URL, url.s); + if (err != CURLE_OK) { errno = easy_errno(fp->easy, err); goto error; } + + // TODO Read id and secret from config files + + // If we have no id/secret, we can't sign the request but will + // still be able to access public data sets. 
+ if (id.l > 0 && secret.l > 0) { + unsigned char digest[DIGEST_BUFSIZ]; + size_t digest_len = s3_sign(digest, &secret, message); + + kputs("Authorization: AWS ", &auth_hdr); + kputs(id.s, &auth_hdr); + kputc(':', &auth_hdr); + base64_kput(digest, digest_len, &auth_hdr); + + if (add_header(fp, auth_hdr.s) < 0) goto error; + } + + ret = 0; + goto free_and_return; + +error: + ret = -1; + +free_and_return: + save = errno; + free(url.s); + free(id.s); + free(secret.s); + free(auth_hdr.s); + free(message->s); + errno = save; + return ret; +} From 87226f5fd4ff2c920cd778eccfca65fa8141c881 Mon Sep 17 00:00:00 2001 From: DonFreed Date: Fri, 26 Jun 2015 13:22:30 -0400 Subject: [PATCH 2/4] Added parsing for additional file types for hisremote. Added code to incorporate an 'AWS_SESSION_TOKEN' environment variable. Added this token as a CanonicalizedAmzHeader to the signature message and a header incorporating the token. Changed the request style from virtual hosted to path. --- hfile.c | 14 +++++++++++--- hfile_libcurl.c | 23 ++++++++++++++++++++--- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/hfile.c b/hfile.c index 160aea02e..6652eba6d 100644 --- a/hfile.c +++ b/hfile.c @@ -570,9 +570,17 @@ hFILE *hopen(const char *fname, const char *mode) int hisremote(const char *fname) { // FIXME Make a new backend entry to return this - if (strncmp(fname, "http://", 7) == 0 || - strncmp(fname, "https://", 8) == 0 || - strncmp(fname, "ftp://", 6) == 0) return 1; + if (strncmp(fname, "file://", 7) == 0) return 0; +#ifdef HAVE_LIBCURL + else if (strncmp(fname, "http://", 7) == 0 || + strncmp(fname, "https://", 8) == 0 || + strncmp(fname, "s3://", 5) == 0 || + strncmp(fname, "s3+http://", 10) == 0 || + strncmp(fname, "s3+https://", 11) == 0 || + strncmp(fname, "ftp://", 6) == 0) return 1; +#endif + else if (strncmp(fname, "http://", 7) == 0 || + strncmp(fname, "ftp://", 6) == 0) return 1; #ifdef HAVE_IRODS else if (strncmp(fname, "irods:", 6) == 0) return 1; #endif diff 
--git a/hfile_libcurl.c b/hfile_libcurl.c index 946f6ce0b..810d42ba7 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -657,7 +657,9 @@ add_s3_settings(hFILE_libcurl *fp, const char *s3url, kstring_t *message) kstring_t url = { 0, 0, NULL }; kstring_t id = { 0, 0, NULL }; kstring_t secret = { 0, 0, NULL }; + kstring_t token = { 0, 0, NULL }; kstring_t auth_hdr = { 0, 0, NULL }; + kstring_t token_hdr = { 0, 0, NULL }; time_t now = time(NULL); #ifdef HAVE_GMTIME_R @@ -699,11 +701,19 @@ add_s3_settings(hFILE_libcurl *fp, const char *s3url, kstring_t *message) const char *v; if ((v = getenv("AWS_ACCESS_KEY_ID")) != NULL) kputs(v, &id); if ((v = getenv("AWS_SECRET_ACCESS_KEY")) != NULL) kputs(v, &secret); + if ((v = getenv("AWS_SESSION_TOKEN")) != NULL) kputs(v, &token); } - kputsn(bucket, path - bucket, &url); - kputs(".s3.amazonaws.com", &url); - kputs(path, &url); + // Use a path-style request + kputs("s3.amazonaws.com/", &url); + kputs(bucket, &url); + + // Add token to message as CanonicalizedAmzHeader + if (token.l > 0) { + kputs("x-amz-security-token:", message); + kputs(token.s, message); + kputc('\n', message); + } kputc('/', message); kputs(bucket, message); // CanonicalizedResource is '/' + bucket + path @@ -725,6 +735,11 @@ add_s3_settings(hFILE_libcurl *fp, const char *s3url, kstring_t *message) base64_kput(digest, digest_len, &auth_hdr); if (add_header(fp, auth_hdr.s) < 0) goto error; + if (token.l > 0) { + kputs("X-Amz-Security-Token: ", &token_hdr); + kputs(token.s, &token_hdr); + if (add_header(fp, token_hdr.s) < 0) goto error; + } } ret = 0; @@ -738,7 +753,9 @@ add_s3_settings(hFILE_libcurl *fp, const char *s3url, kstring_t *message) free(url.s); free(id.s); free(secret.s); + free(token.s); free(auth_hdr.s); + free(token_hdr.s); free(message->s); errno = save; return ret; From 1fec5f31a6c89f0b3e2da42053b2c8a58973be0d Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 1 Jul 2015 10:34:11 +0100 Subject: [PATCH 3/4] Fall back to path-style S3 
bucket access if necessary Check whether bucket names are DNS-compliant according to the rules at http://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html Fixes part 3 of #232, hat tip @DonFreed. --- hfile_libcurl.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/hfile_libcurl.c b/hfile_libcurl.c index 946f6ce0b..cb4110450 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -646,6 +646,23 @@ static void base64_kput(const unsigned char *data, size_t len, kstring_t *str) kputsn("==", pad, str); } +static int is_dns_compliant(const char *s0, const char *slim) +{ + int has_nondigit = 0, len = 0; + const char *s; + + for (s = s0; s < slim; len++, s++) + if (islower(*s) || *s == '-') has_nondigit = 1; + else if (isdigit(*s)) ; + else if (*s == '.') { + if (s == s0 || ! isalnum(s[-1])) return 0; + if (s+1 == slim || ! isalnum(s[1])) return 0; + } + else return 0; + + return has_nondigit && len >= 3 && len <= 63; +} + static int add_s3_settings(hFILE_libcurl *fp, const char *s3url, kstring_t *message) { @@ -701,8 +718,15 @@ add_s3_settings(hFILE_libcurl *fp, const char *s3url, kstring_t *message) if ((v = getenv("AWS_SECRET_ACCESS_KEY")) != NULL) kputs(v, &secret); } - kputsn(bucket, path - bucket, &url); - kputs(".s3.amazonaws.com", &url); + // Use virtual hosted-style access if possible, otherwise path-style. + if (is_dns_compliant(bucket, path)) { + kputsn(bucket, path - bucket, &url); + kputs(".s3.amazonaws.com", &url); + } + else { + kputs("s3.amazonaws.com/", &url); + kputsn(bucket, path - bucket, &url); + } kputs(path, &url); kputc('/', message); From b10956f6607d69ed44e3e786350c597e794b8ab7 Mon Sep 17 00:00:00 2001 From: DonFreed Date: Thu, 9 Jul 2015 18:33:03 +0000 Subject: [PATCH 4/4] Check that the bucket name does not begin or end with a hyphen. 
--- hfile_libcurl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hfile_libcurl.c b/hfile_libcurl.c index 86b72cd0d..d8d1caa39 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -651,6 +651,8 @@ static int is_dns_compliant(const char *s0, const char *slim) int has_nondigit = 0, len = 0; const char *s; + if (*s0 == '-' || slim[-1] == '-') return 0; + for (s = s0; s < slim; len++, s++) if (islower(*s) || *s == '-') has_nondigit = 1; else if (isdigit(*s)) ;