From 59dc1f9c27fa0b366a89a65fc2de963f3795c3f5 Mon Sep 17 00:00:00 2001 From: ThomasHickman Date: Fri, 13 Jul 2018 10:43:27 +0100 Subject: [PATCH 1/2] Abstract ref_to_m5 from cram_io This creates a new file: `ref.c`, which contains a function `m5_to_path`. This function returns a valid path to a reference from the MD5 checksum of a reference. This replaces logic in cram_io, open_trace_file and mFILE. --- .gitignore | 1 + Makefile | 40 ++- config.mk.in | 1 + configure.ac | 12 +- cram/cram_index.c | 3 - cram/cram_io.c | 330 ++++-------------------- cram/cram_structs.h | 2 - cram/mFILE.h | 93 ------- cram/open_trace_file.h | 125 --------- hfile_internal.h | 5 + htslib.mk | 6 +- htslib/ref.h | 63 +++++ htslib_vars.mk | 1 + ref.c | 563 +++++++++++++++++++++++++++++++++++++++++ test/test-ref.c | 60 +++++ 15 files changed, 791 insertions(+), 514 deletions(-) delete mode 100644 cram/mFILE.h delete mode 100644 cram/open_trace_file.h create mode 100644 htslib/ref.h create mode 100644 ref.c create mode 100644 test/test-ref.c diff --git a/.gitignore b/.gitignore index d53f239f3..fb67caf92 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ lib*.so.* /test/test-regidx /test/test-vcf-api /test/test-vcf-sweep +/test/test-ref /test/test_view /test/thrash_threads[1-6] /test/*.tmp diff --git a/Makefile b/Makefile index ef86c83e6..3c75ef914 100644 --- a/Makefile +++ b/Makefile @@ -72,6 +72,7 @@ BUILT_TEST_PROGRAMS = \ test/hfile \ test/sam \ test/test_bgzf \ + test/test-ref \ test/test_realn \ test/test-regidx \ test/test_view \ @@ -140,6 +141,7 @@ LIBHTS_OBJS = \ multipart.o \ probaln.o \ realn.o \ + ref.o \ regidx.o \ sam.o \ synced_bcf_reader.o \ @@ -158,8 +160,6 @@ LIBHTS_OBJS = \ cram/cram_samtools.o \ cram/cram_stats.o \ cram/files.o \ - cram/mFILE.o \ - cram/open_trace_file.o \ cram/pooled_alloc.o \ cram/rANS_static.o \ cram/sam_header.o \ @@ -174,11 +174,12 @@ cram_misc_h = cram/misc.h $(cram_os_h) cram_os_h = cram/os.h $(htslib_hts_endian_h) cram_sam_header_h = 
cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h) -cram_structs_h = cram/cram_structs.h $(htslib_thread_pool_h) cram/string_alloc.h cram/mFILE.h $(htslib_khash_h) -cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h bcf_sr_sort_h = bcf_sr_sort.h $(htslib_synced_bcf_reader_h) $(htslib_kbitset_h) +cram_structs_h = cram/cram_structs.h $(htslib_thread_pool_h) cram/string_alloc.h $(htslib_khash_h) + hfile_internal_h = hfile_internal.h $(htslib_hfile_h) $(textutils_internal_h) hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h) +ref_internal_h = ref_internal.h $(htslib_hfile_h) cram/mFILE.h textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) @@ -316,18 +317,17 @@ plugin.o plugin.pico: plugin.c config.h $(hts_internal_h) $(htslib_kstring_h) probaln.o probaln.pico: probaln.c config.h $(htslib_hts_h) realn.o realn.pico: realn.c config.h $(htslib_hts_h) $(htslib_sam_h) textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstring_h) $(hts_internal_h) +ref.o ref.pico: ref.c config.h $(htslib_kstring_h) $(htslib_bgzf_h) $(cram_h) $(cram_io_h) $(htslib_hfile_h) cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(htslib_hts_endian_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) 
$(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(htslib_ref_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c config.h $(cram_h) $(htslib_sam_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/files.o cram/files.pico: cram/files.c config.h $(cram_misc_h) -cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h -cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/pooled_alloc.h $(cram_misc_h) cram/rANS_static.o cram/rANS_static.pico: cram/rANS_static.c config.h cram/rANS_static.h cram/rANS_byte.h cram/sam_header.o cram/sam_header.pico: cram/sam_header.c config.h $(htslib_hts_log_h) $(cram_sam_header_h) cram/string_alloc.h @@ -354,7 +354,7 @@ tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htsl # # If using MSYS, avoid poor shell expansion via: # MSYS2_ARG_CONV_EXCL="*" make check -check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) +check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) test_ebi_fetch test/hts_endian test/fieldarith test/fieldarith.sam test/hfile @@ -364,6 +364,26 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) test/test-regidx cd test && REF_PATH=: ./test.pl $${TEST_OPTS:-} +test_ebi_fetch: + @if [ "$(ENABLE_HTTP_TESTS)" = true ] ; then \ + export REF_CACHE=`mktemp -d` && \ + echo "test ref: fetching" && \ + test/test-ref && \ + echo "test ref: fetch from cache" && \ + test/test-ref && \ + rm -rf $$REF_CACHE; \ + export 
REF_CACHE=`mktemp -d` && \ + test/test_view -t test/xx.fa -S -C test/xx#rg.sam > test/xx#rg.tmp.cram && \ + echo "test view: fetching" && \ + test/test_view -D test/xx#rg.tmp.cram > /dev/null && \ + echo "test view: fetch from cache" && \ + test/test_view -D test/xx#rg.tmp.cram > /dev/null && \ + rm -rf $$REF_CACHE; \ + else \ + echo "Warning: testing of EBI fetching disabled"; \ + fi + + test/hts_endian: test/hts_endian.o $(CC) $(LDFLAGS) -o $@ test/hts_endian.o $(LIBS) @@ -379,6 +399,9 @@ test/sam: test/sam.o libhts.a test/test_bgzf: test/test_bgzf.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a -lz $(LIBS) -lpthread +test/test-ref: test/test-ref.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test-ref.o libhts.a $(LIBS) -lpthread + test/test_realn: test/test_realn.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread @@ -406,6 +429,7 @@ test/hfile.o: test/hfile.c config.h $(htslib_hfile_h) $(htslib_hts_defs_h) test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h) test/test-realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) +test/test-ref.o: test/test-ref.c $(htslib_bgzf_h) test/test-regidx.o: test/test-regidx.c config.h $(htslib_regidx_h) $(hts_internal_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) diff --git a/config.mk.in b/config.mk.in index 55da9c019..6b7e1c6dd 100644 --- a/config.mk.in +++ b/config.mk.in @@ -47,6 +47,7 @@ LIBS = @LIBS@ PLATFORM = @PLATFORM@ PLUGIN_EXT = @PLUGIN_EXT@ +ENABLE_HTTP_TESTS = @ENABLE_HTTP_TESTS@ # Lowercase here indicates these are "local" to config.mk plugin_OBJS = diff --git a/configure.ac b/configure.ac index d53bf9935..d96f9c78e 100644 --- a/configure.ac +++ 
b/configure.ac @@ -136,6 +136,11 @@ AC_ARG_ENABLE([s3], [support Amazon AWS S3 URLs])], [], [enable_s3=check]) +AC_ARG_ENABLE([http-tests], + [AS_HELP_STRING([--disable-http-tests], + [disables tests that require querying to a http server])], + [disable_http_tests=yes], [disable_http_tests=no]) + test -n "$host_alias" || host_alias=unknown-`uname -s` AC_MSG_CHECKING([shared library type for $host_alias]) case $host_alias in @@ -293,10 +298,14 @@ Either configure with --without-libdeflate or resolve this error to build HTSlib.])])])]) libcurl=disabled +ENABLE_HTTP_TESTS=false if test "$enable_libcurl" != no; then AC_CHECK_LIB([curl], [curl_easy_pause], [AC_DEFINE([HAVE_LIBCURL], 1, [Define if libcurl file access is enabled.]) - libcurl=enabled], + libcurl=enabled + if test "$disable_http_tests" = no; then + ENABLE_HTTP_TESTS=true + fi ], [AC_CHECK_LIB([curl], [curl_easy_init], [message="library is too old (7.18+ required)"], [message="library not found"]) @@ -322,6 +331,7 @@ dnl -lcurl is only needed for static linking if hfile_libcurl is not a plugin fi fi AC_SUBST([libcurl]) +AC_SUBST([ENABLE_HTTP_TESTS]) gcs=disabled if test "$enable_gcs" != no; then diff --git a/cram/cram_index.c b/cram/cram_index.c index 1a52b5701..43c925893 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -588,9 +588,6 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) { BGZF *fp; kstring_t fn_idx_str = {0}; - // Useful for cram_index_build_multiref - cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR); - if (! fn_idx) { kputs(fn_base, &fn_idx_str); kputs(".crai", &fn_idx_str); diff --git a/cram/cram_io.c b/cram/cram_io.c index 6bbffc9e9..f8376a2ba 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -77,7 +77,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cram/cram.h" #include "cram/os.h" #include "htslib/hts.h" -#include "cram/open_trace_file.h" +#include "htslib/ref.h" #include "cram/rANS_static.h" //#define REF_DEBUG @@ -1500,13 +1500,10 @@ char *cram_content_type2str(enum cram_content_type t) { * Frees/unmaps a reference sequence and associated file handles. */ static void ref_entry_free_seq(ref_entry *e) { - if (e->mf) - mfclose(e->mf); - if (e->seq && !e->mf) + if (e->seq) free(e->seq); e->seq = NULL; - e->mf = NULL; } void refs_free(refs_t *r) { @@ -1709,7 +1706,6 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) { e->count = 0; e->seq = NULL; - e->mf = NULL; e->is_md5 = 0; k = kh_put(refs, r->h_meta, e->name, &n); @@ -1907,242 +1903,45 @@ int cram_set_header(cram_fd *fd, SAM_hdr *hdr) { } /* - * Converts a directory and a filename into an expanded path, replacing %s - * in directory with the filename and %[0-9]+s with portions of the filename - * Any remaining parts of filename are added to the end with /%s. 
- */ -int expand_cache_path(char *path, char *dir, char *fn) { - char *cp, *start = path; - size_t len; - size_t sz = PATH_MAX; - - while ((cp = strchr(dir, '%'))) { - if (cp-dir >= sz) return -1; - strncpy(path, dir, cp-dir); - path += cp-dir; - sz -= cp-dir; - - if (*++cp == 's') { - len = strlen(fn); - if (len >= sz) return -1; - strcpy(path, fn); - path += len; - sz -= len; - fn += len; - cp++; - } else if (*cp >= '0' && *cp <= '9') { - char *endp; - long l; - - l = strtol(cp, &endp, 10); - l = MIN(l, strlen(fn)); - if (*endp == 's') { - if (l >= sz) return -1; - strncpy(path, fn, l); - path += l; - fn += l; - sz -= l; - *path = 0; - cp = endp+1; - } else { - if (sz < 3) return -1; - *path++ = '%'; - *path++ = *cp++; - } - } else { - if (sz < 3) return -1; - *path++ = '%'; - *path++ = *cp++; - } - dir = cp; - } - - len = strlen(dir); - if (len >= sz) return -1; - strcpy(path, dir); - path += len; - sz -= len; - - len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0); - if (len >= sz) return -1; - if (*fn && path > start && path[-1] != '/') - *path++ = '/'; - strcpy(path, fn); - return 0; -} - -/* - * Make the directory containing path and any prefix directories. - */ -void mkdir_prefix(char *path, int mode) { - char *cp = strrchr(path, '/'); - if (!cp) - return; - - *cp = 0; - if (is_directory(path)) { - *cp = '/'; - return; - } - - if (mkdir(path, mode) == 0) { - chmod(path, mode); - *cp = '/'; - return; - } - - mkdir_prefix(path, mode); - mkdir(path, mode); - chmod(path, mode); - *cp = '/'; -} - -/* - * Return the cache directory to use, based on the first of these - * environment variables to be set to a non-empty value. 
- */ -static const char *get_cache_basedir(const char **extra) { - char *base; - - *extra = ""; - - base = getenv("XDG_CACHE_HOME"); - if (base && *base) return base; - - base = getenv("HOME"); - if (base && *base) { *extra = "/.cache"; return base; } - - base = getenv("TMPDIR"); - if (base && *base) return base; - - base = getenv("TEMP"); - if (base && *base) return base; - - return "/tmp"; -} - -/* - * Return an integer representation of pthread_self(). - */ -static unsigned get_int_threadid() { - pthread_t pt = pthread_self(); - unsigned char *s = (unsigned char *) &pt; - size_t i; - unsigned h = 0; - for (i = 0; i < sizeof(pthread_t); i++) - h = (h << 5) - h + s[i]; - return h; -} - -/* - * Queries the M5 string from the header and attempts to populate the + * Queries the M5 string from the header and attempts to populate the * reference from this using the REF_PATH environment. * - * Returns 0 on sucess + * Returns 0 on success * -1 on failure */ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { - char *ref_path = getenv("REF_PATH"); SAM_hdr_type *ty; SAM_hdr_tag *tag; - char path[PATH_MAX], path_tmp[PATH_MAX + 64]; - char cache[PATH_MAX], cache_root[PATH_MAX]; - char *local_cache = getenv("REF_CACHE"); - mFILE *mf; - int local_path = 0; hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id); - cache_root[0] = '\0'; - - if (!ref_path || *ref_path == '\0') { - /* - * If we have no ref path, we use the EBI server. - * However to avoid spamming it we require a local ref cache too.
- */ - ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s"; - if (!local_cache || *local_cache == '\0') { - const char *extra; - const char *base = get_cache_basedir(&extra); - snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra); - snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra); - local_cache = cache; - hts_log_info("Populating local cache: %s", local_cache); - } - } - if (!r->name) return -1; if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name))) return -1; - if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL))) - goto no_M5; - - hts_log_info("Querying ref %s", tag->str+3); - - /* Use cache if available */ - if (local_cache && *local_cache) { - if (expand_cache_path(path, local_cache, tag->str+3) == 0) - local_path = 1; - } + int no_m5 = 0; + char* ref_fn; -#ifndef HAVE_MMAP - char *path2; - /* Search local files in REF_PATH; we can open them and return as above */ - if (!local_path && (path2 = find_path(tag->str+3, ref_path))) { - strncpy(path, path2, PATH_MAX); - free(path2); - if (is_file(path)) // incase it's too long - local_path = 1; + if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL))){ + no_m5 = 1; } -#endif - - /* Found via REF_CACHE or local REF_PATH file */ - if (local_path) { - struct stat sb; - BGZF *fp; - - if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) { - r->length = sb.st_size; - r->offset = r->line_length = r->bases_per_line = 0; + else { + const char* m5_str = tag->str+3; + hts_log_info("Querying ref %s", m5_str); - r->fn = string_dup(fd->refs->pool, path); - - if (fd->refs->fp) - if (bgzf_close(fd->refs->fp) != 0) - return -1; - fd->refs->fp = fp; - fd->refs->fn = r->fn; - r->is_md5 = 1; - - // Fall back to cram_get_ref() where it'll do the actual - // reading of the file. 
- return 0; + if (!(ref_fn = m5_to_path(m5_str))){ + no_m5 = 1; } } - - /* Otherwise search full REF_PATH; slower as loads entire file */ - if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) { - size_t sz; - r->seq = mfsteal(mf, &sz); - if (r->seq) { - r->mf = NULL; - } else { - // keep mf around as we couldn't detach - r->seq = mf->data; - r->mf = mf; - } - r->length = sz; - r->is_md5 = 1; - } else { + if (no_m5) { + // Couldn't get reference using the M5 (no M5 field, not in search path + // and not in the M5 cache), see if querying the @SQ UR: tag works refs_t *refs; char *fn; - no_M5: - /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */ if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL))) return -1; @@ -2178,76 +1977,52 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { return 0; } - /* Populate the local disk cache if required */ - if (local_cache && *local_cache) { - int pid = (int) getpid(); - unsigned thrid = get_int_threadid(); - hFILE *fp; + r->is_md5 = 1; - if (*cache_root && !is_directory(cache_root)) { - hts_log_warning("Creating reference cache directory %s\n" - "This may become large; see the samtools(1) manual page REF_CACHE discussion", - cache_root); - } + if (!strncmp(ref_fn, "http:", 5) || !strncmp(ref_fn, "ftp:", 4)) { + const int READ_LENGTH = 8193; + /* Read the whole sequence into memory, as we're dealing with a remote file */ + kstring_t ref_seq = {0}; - if (expand_cache_path(path, local_cache, tag->str+3) < 0) { - return 0; // Not fatal - we have the data already so keep going. 
- } - hts_log_info("Writing cache file '%s'", path); - mkdir_prefix(path, 01777); - - do { - // Attempt to further uniquify the temporary filename - unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); - thrid++; // Ensure filename changes even if time/clock haven't - - snprintf(path_tmp, sizeof(path_tmp), "%s.tmp_%d_%u_%u", - path, pid, thrid, t); - fp = hopen(path_tmp, "wx"); - } while (fp == NULL && errno == EEXIST); - if (!fp) { - perror(path_tmp); - - // Not fatal - we have the data already so keep going. - return 0; - } + hFILE* ref_hfile = hopen(ref_fn, "r"); - // Check md5sum - hts_md5_context *md5; - char unsigned md5_buf1[16]; - char md5_buf2[33]; + do{ + ks_resize(&ref_seq, ks_len(&ref_seq) + READ_LENGTH); + } while (hread(ref_hfile, ks_str(&ref_seq) + ks_len(&ref_seq), READ_LENGTH) > 0); - if (!(md5 = hts_md5_init())) { - hclose_abruptly(fp); - unlink(path_tmp); + if(hclose(ref_hfile) != 0){ + free(ref_fn); return -1; } - hts_md5_update(md5, r->seq, r->length); - hts_md5_final(md5_buf1, md5); - hts_md5_destroy(md5); - hts_md5_hex(md5_buf2, md5_buf1); - - if (strncmp(tag->str+3, md5_buf2, 32) != 0) { - hts_log_error("Mismatching md5sum for downloaded reference"); - hclose_abruptly(fp); - unlink(path_tmp); + + r->length = ks_len(&ref_seq); + r->seq = ks_str(&ref_seq); + } + else { + struct stat st; + if(stat(ref_fn, &st) != 0){ + free(ref_fn); return -1; } + r->length = st.st_size; - if (hwrite(fp, r->seq, r->length) != r->length) { - perror(path); - } - if (hclose(fp) < 0) { - unlink(path_tmp); - } else { - if (0 == chmod(path_tmp, 0444)) - rename(path_tmp, path); - else - unlink(path_tmp); - } + r->offset = r->line_length = r->bases_per_line = 0; + + fd->refs->fn = r->fn = string_dup(fd->refs->pool, ref_fn); + + if (fd->refs->fp) + if (bgzf_close(fd->refs->fp) != 0){ + free(ref_fn); + return -1; + } + + fd->refs->fp = bgzf_open_ref(ref_fn, "r", 1); } + free(ref_fn); + return 0; + } static void cram_ref_incr_locked(refs_t *r, int id) { @@ -2423,7 
+2198,6 @@ ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) { RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1)); e->seq = seq; - e->mf = NULL; e->count++; /* @@ -2514,8 +2288,8 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { * an on-disk filename for it. * * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls - * open_path_mfile and libcurl, which isn't multi-thread safe unless I - * rewrite my code to have one curl handle per thread. + * libcurl, which isn't multi-thread safe unless I rewrite my code to + * have one curl handle per thread. */ pthread_mutex_lock(&fd->refs->lock); if (r->length == 0) { diff --git a/cram/cram_structs.h b/cram/cram_structs.h index ec60ebb88..e6719dfd5 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -52,7 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "htslib/thread_pool.h" #include "cram/string_alloc.h" -#include "cram/mFILE.h" #include "htslib/khash.h" #ifdef __cplusplus @@ -608,7 +607,6 @@ typedef struct ref_entry { int line_length; int64_t count; // for shared references so we know to dealloc seq char *seq; - mFILE *mf; int is_md5; // Reference comes from a raw seq found by MD5 } ref_entry; diff --git a/cram/mFILE.h b/cram/mFILE.h deleted file mode 100644 index b0463c765..000000000 --- a/cram/mFILE.h +++ /dev/null @@ -1,93 +0,0 @@ -/* -Copyright (c) 2005-2006, 2008-2009 Genome Research Ltd. -Author: James Bonfield - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. 
Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifndef _MFILE_H_ -#define _MFILE_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct { - FILE *fp; - char *data; - size_t alloced; - int eof; - int mode; /* open mode in MF_?? 
define bit pattern */ - size_t size; - size_t offset; - size_t flush_pos; -} mFILE; - -// Work around a clash with winuser.h -#ifdef MF_APPEND -# undef MF_APPEND -#endif - -#define MF_READ 1 -#define MF_WRITE 2 -#define MF_APPEND 4 -#define MF_BINARY 8 -#define MF_TRUNC 16 -#define MF_MODEX 32 -#define MF_MMAP 64 - -mFILE *mfreopen(const char *path, const char *mode, FILE *fp); -mFILE *mfopen(const char *path, const char *mode); -int mfdetach(mFILE *mf); -int mfclose(mFILE *mf); -int mfdestroy(mFILE *mf); -int mfseek(mFILE *mf, long offset, int whence); -long mftell(mFILE *mf); -void mrewind(mFILE *mf); -void mftruncate(mFILE *mf, long offset); -int mfeof(mFILE *mf); -size_t mfread(void *ptr, size_t size, size_t nmemb, mFILE *mf); -size_t mfwrite(void *ptr, size_t size, size_t nmemb, mFILE *mf); -int mfgetc(mFILE *mf); -int mungetc(int c, mFILE *mf); -mFILE *mfcreate(char *data, int size); -mFILE *mfcreate_from(const char *path, const char *mode_str, FILE *fp); -void mfrecreate(mFILE *mf, char *data, int size); -void *mfsteal(mFILE *mf, size_t *size_out); -char *mfgets(char *s, int size, mFILE *mf); -int mfflush(mFILE *mf); -mFILE *mstdin(void); -mFILE *mstdout(void); -mFILE *mstderr(void); -void mfascii(mFILE *mf); - -#ifdef __cplusplus -} -#endif - -#endif /* _MFILE_H_ */ diff --git a/cram/open_trace_file.h b/cram/open_trace_file.h deleted file mode 100644 index 7e1f29fd9..000000000 --- a/cram/open_trace_file.h +++ /dev/null @@ -1,125 +0,0 @@ -/* -Author: James Bonfield - -Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL -All rights reserved - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - . Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - . 
Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - . Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF -MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or -promote products derived from this software without specific prior written -permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -/* -Copyright (c) 2008, 2009, 2013 Genome Research Ltd. -Author: James Bonfield - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. 
Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifndef _OPEN_TRACE_FILE_H_ -#define _OPEN_TRACE_FILE_H_ - -#include "cram/mFILE.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Tokenises the search path splitting on colons (unix) or semicolons - * (windows). - * We also explicitly add a "./" to the end of the search path - * - * Returns: A new search path with items separated by nul chars. Two nul - * chars in a row represent the end of the tokenised path. - * Returns NULL for a failure. - * - * The returned data has been malloced. It is up to the caller to free this - * memory. - */ -char *tokenise_search_path(char *searchpath); - -/* - * Opens a trace file named 'file'. This is initially looked for as a - * pathname relative to a file named "relative_to". This may (for - * example) be the name of an experiment file referencing the trace - * file. In this case by passing relative_to as the experiment file - * filename the trace file will be picked up in the same directory as - * the experiment file. 
Relative_to may be supplied as NULL. - * - * 'file' is looked for at relative_to, then the current directory, and then - * all of the locations listed in 'path' (which is a colon separated list). - * If 'path' is NULL it uses the RAWDATA environment variable instead. - * - * Returns a mFILE pointer when found. - * NULL otherwise. - */ -mFILE *open_path_mfile(char *file, char *path, char *relative_to); - -/* - * Returns a mFILE containing the entire contents of the url; - * NULL on failure. - */ -mFILE *find_file_url(char *file, char *url); - - -/* - * As per open_path_mfile, but searching only for local filenames. - * This is useful as we may avoid doing a full mfopen and loading - * the entire file into memory. - * - * Returns the expanded pathname if found. - * NULL if not - */ -char *find_path(char *file, char *path); - -#ifdef __cplusplus -} -#endif - -#endif /* _OPEN_TRACE_FILE_H_ */ diff --git a/hfile_internal.h b/hfile_internal.h index c243f8e77..4e4648cc5 100644 --- a/hfile_internal.h +++ b/hfile_internal.h @@ -91,6 +91,11 @@ int hfile_oflags(const char *mode); or 0 for a default-sized buffer. */ hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity); +/* Initialise a hfile wrapper - a wrapper that redirects all public calls + to another hFILE. This can be used so that you can contain extra information + about a hFILE in a wrapper hFILE */ +hFILE *hfile_init_wrapper(size_t struct_size, hFILE* wrapper); + /* Alternative to hfile_init() for in-memory backends for which the base buffer is the only storage. Buffer is already allocated via malloc(2) of size buf_size and with buf_filled bytes already filled. 
Ownership diff --git a/htslib.mk b/htslib.mk index eb8f38a77..79ed6e1f8 100644 --- a/htslib.mk +++ b/htslib.mk @@ -66,6 +66,7 @@ HTSLIB_PUBLIC_HEADERS = \ $(HTSDIR)/htslib/ksort.h \ $(HTSDIR)/htslib/kstring.h \ $(HTSDIR)/htslib/regidx.h \ + $(HTSDIR)/htslib/ref.h \ $(HTSDIR)/htslib/sam.h \ $(HTSDIR)/htslib/synced_bcf_reader.h \ $(HTSDIR)/htslib/tbx.h \ @@ -100,6 +101,7 @@ HTSLIB_ALL = \ $(HTSDIR)/probaln.c \ $(HTSDIR)/realn.c \ $(HTSDIR)/regidx.c \ + $(HTSDIR)/ref.c \ $(HTSDIR)/sam.c \ $(HTSDIR)/synced_bcf_reader.c \ $(HTSDIR)/tbx.c \ @@ -128,11 +130,7 @@ HTSLIB_ALL = \ $(HTSDIR)/cram/cram_stats.h \ $(HTSDIR)/cram/cram_structs.h \ $(HTSDIR)/cram/files.c \ - $(HTSDIR)/cram/mFILE.c \ - $(HTSDIR)/cram/mFILE.h \ $(HTSDIR)/cram/misc.h \ - $(HTSDIR)/cram/open_trace_file.c \ - $(HTSDIR)/cram/open_trace_file.h \ $(HTSDIR)/cram/os.h \ $(HTSDIR)/cram/pooled_alloc.c \ $(HTSDIR)/cram/pooled_alloc.h \ diff --git a/htslib/ref.h b/htslib/ref.h new file mode 100644 index 000000000..14bd7c3b5 --- /dev/null +++ b/htslib/ref.h @@ -0,0 +1,63 @@ +/// @file ref.h +/// Reference genome fetching +/* + Copyright (C) 2017-2018 Genome Research Ltd. + + Author: Thomas Hickman + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + + +#ifndef HTSLIB_REF_H +#define HTSLIB_REF_H + +#include "htslib/hfile.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* m5_to_ref() - returns the reference genome that has a given MD5 string + * @param m5_str: The m5 string to query + * @returns: A hFILE containing a file pointer to a reference genome. + * NULL on failure + * + * Note: This function is not currently thread safe, so locks + * need to be acquired before calling this, in a multi-threaded + * environment + */ +hFILE* m5_to_ref(const char *m5_str); + +/* m5_to_path() - returns a path to a reference genome that has a + * given MD5 string + * @param m5_str: The m5 string to query + * @returns: A path to the correct reference genome. + * NULL on failure + * + * Note: This function is not currently thread safe, so locks + * need to be acquired before calling this, in a multi-threaded + * environment + */ +char* m5_to_path(const char *m5_str); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/htslib_vars.mk b/htslib_vars.mk index 97928473e..a10a009b0 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -44,6 +44,7 @@ htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h +htslib_ref_h = $(HTSPREFIX)htslib/ref.h htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) htslib_synced_bcf_reader_h = $(HTSPREFIX)htslib/synced_bcf_reader.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_tbx_h) htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h) diff --git a/ref.c b/ref.c new file mode 100644 index 000000000..675c26ce8 --- /dev/null +++ b/ref.c @@ -0,0 +1,563 @@ +/* +Copyright (c) 2008-2009, 2012-2018 Genome Research Ltd.
+Authors: James Bonfield , Thomas Hickman + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* +Author: James Bonfield + +Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL +All rights reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF +MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or +promote products derived from this software without specific prior written +permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "htslib/kstring.h" + +#include "cram/cram.h" +#include "cram/cram_io.h" +#include "cram/os.h" +#include "cram/misc.h" +#include "htslib/ref.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef PATH_MAX +#define PATH_MAX FILENAME_MAX // was 1024 in open_trace_file +#endif + + +/* + * Return the cache directory to use, based on the first of these + * environment variables to be set to a non-empty value. 
+ *
+ * On return *extra holds a suffix ("" or "/.cache") that the caller
+ * appends to the returned directory.
+ */
+static const char *get_cache_basedir(const char **extra)
+{
+    char *base;
+
+    /* Only the $HOME fallback needs a sub-directory appended. */
+    *extra = "";
+
+    base = getenv("XDG_CACHE_HOME");
+    if (base && *base)
+        return base;
+
+    base = getenv("HOME");
+    if (base && *base)
+    {
+        *extra = "/.cache";
+        return base;
+    }
+
+    base = getenv("TMPDIR");
+    if (base && *base)
+        return base;
+
+    base = getenv("TEMP");
+    if (base && *base)
+        return base;
+
+    return "/tmp";
+}
+
+/*
+ * Converts a directory and a filename into an expanded path, replacing %s
+ * in directory with the filename and %[0-9]+s with portions of the filename
+ * Any remaining parts of filename are added to the end with /%s.
+ *
+ * A '%' that does not start one of those patterns is copied literally.
+ *
+ * Returns a malloc()ed string that the caller must free, or NULL if the
+ * allocation fails.
+ */
+static char* expand_path(char *dir, char *fn)
+{
+    size_t fn_left = strlen(fn);    /* bytes of fn not yet emitted */
+    size_t tail;
+    char *cp;
+    /*
+     * Worst case is "dir/fn": both strings, the '/' separator and the NUL.
+     * Every substitution consumes at least as many bytes of dir as the
+     * pattern is long, so no expansion can exceed this bound.
+     */
+    char *out = malloc(strlen(dir) + fn_left + 2);
+    char *out_writer = out;
+
+    if (!out)
+        return NULL;
+
+    while ((cp = strchr(dir, '%')))
+    {
+        /* Literal text in front of this '%'. */
+        memcpy(out_writer, dir, cp - dir);
+        out_writer += cp - dir;
+
+        if (*++cp == 's')
+        {
+            /* "%s": everything of fn that has not been used yet. */
+            memcpy(out_writer, fn, fn_left);
+            out_writer += fn_left;
+            fn += fn_left;
+            fn_left = 0;
+            cp++;
+        }
+        else if (*cp >= '0' && *cp <= '9')
+        {
+            char *endp;
+            /* *cp is a digit, so strtol() cannot return a negative value. */
+            size_t take = (size_t)strtol(cp, &endp, 10);
+
+            if (take > fn_left)
+                take = fn_left;
+            if (*endp == 's')
+            {
+                /* "%Ns": the next N bytes of fn. */
+                memcpy(out_writer, fn, take);
+                out_writer += take;
+                fn += take;
+                fn_left -= take;
+                cp = endp + 1;
+            }
+            else
+            {
+                /* Digits without an 's': keep '%' and the first digit. */
+                *out_writer++ = '%';
+                *out_writer++ = *cp++;
+            }
+        }
+        else
+        {
+            *out_writer++ = '%';
+            /* Do not step over the terminator when dir ends in '%'. */
+            if (*cp)
+                *out_writer++ = *cp++;
+        }
+        dir = cp;
+    }
+
+    /* Literal tail of dir, then whatever is left of fn behind a '/'. */
+    tail = strlen(dir);
+    memcpy(out_writer, dir, tail);
+    out_writer += tail;
+    if (fn_left && out_writer > out && out_writer[-1] != '/')
+        *out_writer++ = '/';
+    memcpy(out_writer, fn, fn_left + 1);    /* +1 copies the NUL */
+
+    return out;
+}
+
+/*
+ * Like expand_path, but doesn't use %[0-9]+s rules.
+ * Only "%s" is substituted; any other '%' (for example a URL escape such
+ * as "%20") is copied literally.
+ *
+ * Returns a malloc()ed string that the caller must free, or NULL if the
+ * allocation fails.
+ */
+static char* expand_path_basic(char *dir, char *fn)
+{
+    size_t fn_left = strlen(fn);    /* bytes of fn not yet emitted */
+    size_t tail;
+    char *cp;
+    /* Worst case is "dir/fn": both strings, the '/' separator and the NUL. */
+    char *out = malloc(strlen(dir) + fn_left + 2);
+    char *out_writer = out;
+
+    if (!out)
+        return NULL;
+
+    while ((cp = strchr(dir, '%')))
+    {
+        memcpy(out_writer, dir, cp - dir);
+        out_writer += cp - dir;
+
+        if (*++cp == 's')
+        {
+            memcpy(out_writer, fn, fn_left);
+            out_writer += fn_left;
+            fn += fn_left;
+            fn_left = 0;
+            cp++;
+        }
+        else
+        {
+            /* Not a substitution: keep the '%' instead of dropping it. */
+            *out_writer++ = '%';
+        }
+        dir = cp;
+    }
+
+    tail = strlen(dir);
+    memcpy(out_writer, dir, tail);
+    out_writer += tail;
+    if (fn_left && out_writer > out && out_writer[-1] != '/')
+        *out_writer++ = '/';
+    memcpy(out_writer, fn, fn_left + 1);    /* +1 copies the NUL */
+
+    return out;
+}
+
+/*
+ * Return an integer representation of pthread_self().
+ *
+ * pthread_t is opaque, so its bytes are folded into an unsigned hash
+ * (h = h * 31 + byte) rather than being cast directly.
+ */
+static unsigned get_int_threadid(void)
+{
+    pthread_t pt = pthread_self();
+    unsigned char *s = (unsigned char *)&pt;
+    size_t i;
+    unsigned h = 0;
+    for (i = 0; i < sizeof(pthread_t); i++)
+        h = (h << 5) - h + s[i];
+    return h;
+}
+
+/*
+ * Make the directory containing path and any prefix directories.
+ *
+ * path is modified temporarily (its last '/' is overwritten with a nul)
+ * but restored before returning.  This is best effort: failures of
+ * mkdir() and chmod() are not reported to the caller.
+ */
+static void mkdir_prefix(char *path, int mode)
+{
+    char *cp = strrchr(path, '/');
+    if (!cp)
+        return;
+
+    /* Cut path down to its directory part. */
+    *cp = 0;
+    if (is_directory(path))
+    {
+        *cp = '/';
+        return;
+    }
+
+    if (mkdir(path, mode) == 0)
+    {
+        /* chmod() as well, so that the umask does not reduce mode. */
+        chmod(path, mode);
+        *cp = '/';
+        return;
+    }
+
+    /* Probably a missing parent: create the ancestors, then retry. */
+    mkdir_prefix(path, mode);
+    mkdir(path, mode);
+    chmod(path, mode);
+    *cp = '/';
+}
+
+#define READ_CHUNK_SIZE 8192
+
+/*
+ * Tokenises the search path splitting on colons (unix) or semicolons
+ * (windows).
+ * We also explicitly add a "./" to the end of the search path
+ *
+ * Returns: A new search path with items separated by nul chars. Two nul
+ * chars in a row represent the end of the tokenised path.
+ * Returns NULL for a failure.
+ *
+ * The returned data has been malloced. It is up to the caller to free this
+ * memory.
+ *
+ * Entries starting with http:, https: or ftp: (optionally prefixed with
+ * '|' or "URL=") keep the colons of their scheme and host:port part, so
+ * they need no "::" escaping on unix.
+ */
+static char *tokenise_search_path(char *searchpath)
+{
+    char *newsearch;
+    size_t i, j;
+    size_t len;
+#ifdef _WIN32
+    char path_sep = ';';
+#else
+    char path_sep = ':';
+#endif
+
+    if (!searchpath)
+        searchpath = "";
+
+    /*
+     * Every output byte is paid for by at least one input byte, so besides
+     * len we need room for the nul closing the last item, "./", its nul
+     * and the nul that terminates the list.
+     */
+    len = strlen(searchpath);
+    newsearch = malloc(len + 5);
+    if (!newsearch)
+        return NULL;
+
+    for (i = 0, j = 0; i < len; i++)
+    {
+        /* "::" => ":". Used for escaping colons in http://foo */
+        if (i < len - 1 && searchpath[i] == ':' && searchpath[i + 1] == ':')
+        {
+            newsearch[j++] = ':';
+            i++;
+            continue;
+        }
+
+        /* Handle http:// and ftp:// too without :: */
+        if (path_sep == ':')
+        {
+            if ((i == 0 || (i > 0 && searchpath[i - 1] == ':')) &&
+                (!strncmp(&searchpath[i], "http:", 5) ||
+                 !strncmp(&searchpath[i], "https:", 6) ||
+                 !strncmp(&searchpath[i], "ftp:", 4) ||
+                 !strncmp(&searchpath[i], "|http:", 6) ||
+                 !strncmp(&searchpath[i], "|https:", 7) ||
+                 !strncmp(&searchpath[i], "|ftp:", 5) ||
+                 !strncmp(&searchpath[i], "URL=http:", 9) ||
+                 !strncmp(&searchpath[i], "URL=https:", 10)||
+                 !strncmp(&searchpath[i], "URL=ftp:", 8)))
+            {
+                /* Scheme, up to and including its ':' (always present). */
+                do
+                {
+                    newsearch[j++] = searchpath[i];
+                } while (i < len && searchpath[i++] != ':');
+                if (searchpath[i] == ':')
+                    i++;
+                if (searchpath[i] == '/')
+                    newsearch[j++] = searchpath[i++];
+                if (searchpath[i] == '/')
+                    newsearch[j++] = searchpath[i++];
+                // Look for host:port
+                if (i < len)
+                {
+                    do
+                    {
+                        newsearch[j++] = searchpath[i++];
+                    } while (i < len && searchpath[i] != ':' && searchpath[i] != '/');
+                }
+                /* The ':' or '/' that ended the host, if there is one. */
+                if (i < len)
+                    newsearch[j++] = searchpath[i++];
+                if (i < len && searchpath[i] == ':')
+                    i++;
+                /*
+                 * The URL ran to the end of the search path.  Stop here:
+                 * carrying on would read beyond the terminator and write
+                 * more than the len + 5 bytes that were allocated.
+                 */
+                if (i >= len)
+                    break;
+            }
+        }
+
+        if (searchpath[i] == path_sep)
+        {
+            /* Skip blank path components */
+            if (j && newsearch[j - 1] != 0)
+                newsearch[j++] = 0;
+        }
+        else
+        {
+            newsearch[j++] = searchpath[i];
+        }
+    }
+
+    /* Close the last item, then append "./" and the list terminator. */
+    if (j)
+        newsearch[j++] = 0;
+    newsearch[j++] = '.';
+    newsearch[j++] = '/';
+    newsearch[j++] = 0;
+    newsearch[j++] = 0;
+
+    return newsearch;
+}
+
+/*
+ * Looks in a colon (in non-windows environments) or semi-colon (windows) separated
+ * list (path) to
find a file.
+ *
+ * Any path can contain %s substitutions and filesystem substitutions can contain %Ns
+ * like substitutions
+ *
+ * Returns a malloc()ed path (the caller frees it) when found.
+ * NULL otherwise.
+ */
+static char* resolve_file_in_path(char *file, char *path)
+{
+    char *newsearch;
+    char *proposed_path;
+    char *resolved_path;
+    hFILE *fp;
+
+    if (NULL == (newsearch = tokenise_search_path(path)))
+        return NULL;
+
+    /* Items are nul separated; an empty item marks the end of the list. */
+    for (proposed_path = newsearch; *proposed_path; proposed_path += strlen(proposed_path) + 1)
+    {
+        if (strncmp(proposed_path, "URL=", 4) == 0)
+            resolved_path = expand_path_basic(proposed_path + 4, file);
+        else if (!strncmp(proposed_path, "http:", 5) ||
+                 !strncmp(proposed_path, "https:", 6) ||
+                 !strncmp(proposed_path, "ftp:", 4))
+            resolved_path = expand_path_basic(proposed_path, file);
+        else
+            resolved_path = expand_path(proposed_path, file);
+
+        if (!resolved_path)
+            break;
+
+        // Does the file exist? Use the hFILE logic to find by opening the file, then closing it.
+        if ((fp = hopen(resolved_path, "r")))
+        {
+            hclose_abruptly(fp);
+            free(newsearch);
+
+            return resolved_path;
+        }
+
+        free(resolved_path);
+    }
+
+    free(newsearch);
+    return NULL;
+}
+
+/* Outcomes of write_cache_file(). */
+enum {
+    CACHE_OK = 0,
+    CACHE_IO_ERROR = -1,    /* the cache file could not be written */
+    CACHE_BAD_MD5 = -2      /* the data does not have the requested MD5 */
+};
+
+/*
+ * Streams ref into a uniquely named temporary file beside cache_path,
+ * checks that the data has the MD5 m5_str, and renames the temporary file
+ * to cache_path.  ref is read but not closed.  On any failure the
+ * temporary file is removed.
+ *
+ * Returns CACHE_OK, CACHE_IO_ERROR or CACHE_BAD_MD5.
+ */
+static int write_cache_file(hFILE *ref, const char *cache_path, const char *m5_str)
+{
+    char path_tmp[PATH_MAX];
+    char buf[READ_CHUNK_SIZE];
+    unsigned char digest[16];
+    char digest_hex[33];
+    hts_md5_context *md5 = NULL;
+    hFILE *fp = NULL;
+    int pid = (int)getpid();
+    unsigned thrid = get_int_threadid();
+    ssize_t got;
+    int ret = CACHE_IO_ERROR;
+
+    do {
+        // Attempt to further uniquify the temporary filename
+        unsigned t = ((unsigned)time(NULL)) ^ ((unsigned)clock());
+        int n;
+
+        thrid++; // Ensure filename changes even if time/clock haven't
+
+        n = snprintf(path_tmp, sizeof path_tmp, "%s.tmp_%d_%u_%u",
+                     cache_path, pid, thrid, t);
+        if (n < 0 || (size_t)n >= sizeof path_tmp) {
+            hts_log_error("Cache file name too long: '%s'", cache_path);
+            return CACHE_IO_ERROR;
+        }
+        /* "x": fail with EEXIST rather than reuse somebody else's file. */
+        fp = hopen(path_tmp, "wx");
+    } while (fp == NULL && errno == EEXIST);
+    if (!fp) {
+        perror(path_tmp);
+        return CACHE_IO_ERROR;
+    }
+
+    if (!(md5 = hts_md5_init())) {
+        hts_log_error("Function hts_md5_init failed");
+        goto fail;
+    }
+
+    // Stream the file into the cache and check the md5
+    while ((got = hread(ref, buf, sizeof buf)) > 0) {
+        hts_md5_update(md5, buf, got);
+        if (hwrite(fp, buf, got) != got) {
+            perror(cache_path);
+            goto fail;
+        }
+    }
+    if (got < 0) {
+        hts_log_error("Failed to read reference while writing '%s'", cache_path);
+        goto fail;
+    }
+
+    hts_md5_final(digest, md5);
+    hts_md5_destroy(md5);
+    md5 = NULL;
+    hts_md5_hex(digest_hex, digest);
+
+    if (strncmp(m5_str, digest_hex, 32) != 0) {
+        hts_log_error("Mismatching md5sum for downloaded reference");
+        ret = CACHE_BAD_MD5;
+        goto fail;
+    }
+
+    if (hclose(fp) < 0) {
+        fp = NULL;          /* hclose() releases fp even when it fails */
+        perror(cache_path);
+        goto fail;
+    }
+    fp = NULL;
+
+    /* Publish read-only; rename() makes the entry appear in one step. */
+    if (chmod(path_tmp, 0444) != 0 || rename(path_tmp, cache_path) != 0) {
+        perror(cache_path);
+        goto fail;
+    }
+
+    return CACHE_OK;
+
+fail:
+    if (md5)
+        hts_md5_destroy(md5);
+    if (fp)
+        hclose_abruptly(fp);
+    unlink(path_tmp);
+    return ret;
+}
+
+// Public functions
+
+/*
+ * Finds the reference with MD5 m5_str, looking first in REF_CACHE and then
+ * in REF_PATH.  A reference found via REF_PATH is copied into the cache
+ * when one is configured.
+ *
+ * Returns a malloc()ed path that the caller must free.
+ * Returns NULL if the reference was not found, could not be opened, or
+ * does not have the requested MD5.
+ */
+char* m5_to_path(const char *m5_str){
+    char *ref_path = getenv("REF_PATH");
+    char *ref_cache = getenv("REF_CACHE");
+    char cache[PATH_MAX], cache_root[PATH_MAX];
+    char *cache_path = NULL;
+    char *found_path;
+    hFILE *ref;
+    int status;
+
+    cache_root[0] = '\0';
+
+    if (!ref_path || *ref_path == '\0') {
+        /*
+         * If we have no ref path, we use the EBI server.
+         * However to avoid spamming it we require a local ref cache too.
+         */
+
+        ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s";
+        if (!ref_cache || *ref_cache == '\0') {
+            const char *extra;
+            const char *base = get_cache_basedir(&extra);
+            snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
+            snprintf(cache, PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
+            ref_cache = cache;
+            hts_log_info("Populating local cache: %s", ref_cache);
+        }
+    }
+
+    /* Try in REF_CACHE */
+    if (ref_cache && *ref_cache) {
+        struct stat sb;
+
+        cache_path = expand_path(ref_cache, (char *)m5_str);
+        if (cache_path && 0 == stat(cache_path, &sb))
+            return cache_path;
+        /* A miss: cache_path is kept as the place to store the download. */
+    }
+
+    /* Try in REF_PATH */
+    if (!(found_path = resolve_file_in_path((char*)m5_str, ref_path))) {
+        hts_log_info("Failed to fetch file. REF_PATH: '%s', M5: '%s'", ref_path, m5_str);
+        free(cache_path);
+        return NULL;
+    }
+
+    /* No cache configured (or no memory for its name): use it in place. */
+    if (!cache_path)
+        return found_path;
+
+    /* If a reference cache is in use, populate it. */
+    if (!(ref = hopen(found_path, "r"))) {
+        free(cache_path);
+        free(found_path);
+        return NULL;
+    }
+
+    if (*cache_root && !is_directory(cache_root)) {
+        hts_log_warning("Creating reference cache directory %s\n"
+                        "This may become large; see the samtools(1) manual page REF_CACHE discussion",
+                        cache_root);
+    }
+
+    hts_log_info("Writing cache file '%s'", cache_path);
+    mkdir_prefix(cache_path, 01777);
+
+    status = write_cache_file(ref, cache_path, m5_str);
+    hclose_abruptly(ref);
+
+    if (status == CACHE_OK) {
+        free(found_path);
+        return cache_path;
+    }
+
+    free(cache_path);
+    if (status == CACHE_BAD_MD5) {
+        free(found_path);
+        return NULL;
+    }
+
+    // Doesn't matter if we can't write to the cache, just return the non
+    // cached path.
+    return found_path;
+}
+
+/*
+ * Opens the reference with MD5 m5_str for reading.
+ *
+ * Returns the hFILE (the caller closes it), or NULL on failure.
+ */
+hFILE* m5_to_ref(const char *m5_str){
+    char* m5_path;
+
+    if (!(m5_path = m5_to_path(m5_str)))
+        return NULL;
+
+    hFILE* hf = hopen(m5_path, "r");
+    free(m5_path);
+
+    return hf;
+}
diff --git a/test/test-ref.c b/test/test-ref.c
new file mode 100644
index 000000000..decb17849
--- /dev/null
+++ b/test/test-ref.c
@@ -0,0 +1,60 @@
+/* test/test-ref.c -- ref unit tests
+
+    Copyright (C) 2017 Genome Research Ltd
+
+    Author: Thomas Hickman
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+#include "htslib/ref.h"
+#include "htslib/bgzf.h"
+#include "htslib/hfile.h"
+
+#include
+#include
+#include
+
+/*
+ * Looks up a fixed MD5 with m5_to_ref() and checks that the returned
+ * hFILE can be read from and closed.
+ */
+int main(int argc, char **argv) {
+    const char* m5_str = "bbf4de6d8497a119dda6e074521643dc";
+    hFILE* ref;
+    char buf[100];
+    ssize_t size_read;
+
+    if (!(ref = m5_to_ref(m5_str))){
+        fprintf(stderr, "Error in m5_to_ref\n");
+        return EXIT_FAILURE;
+    }
+
+    /* Keep the result signed: stored in a size_t, a negative error return */
+    /* would become a huge positive count and slip past the check below. */
+    size_read = hread(ref, buf, sizeof buf);
+    if(size_read <= 0){
+        fprintf(stderr, "Invalid hfile size read\n");
+        hclose(ref);    /* already failing; the result is not needed */
+        return EXIT_FAILURE;
+    }
+
+    if(hclose(ref) != 0){
+        fprintf(stderr, "Cannot close hfile\n");
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
From 55e9668a6ef53a600613627dcac4fe756a7fd239 Mon Sep 17 00:00:00 2001
From: ThomasHickman
Date: Fri, 13 Jul 2018 10:31:55 +0100
Subject: [PATCH 2/2] Add error messages for failed reference lookups

This adds functionality implemented in b39e724.
--- cram/cram_io.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index f8376a2ba..654d64257 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1932,6 +1932,8 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { hts_log_info("Querying ref %s", m5_str); if (!(ref_fn = m5_to_path(m5_str))){ + hts_log_warning("Failed to find reference with MD5 \"%s\".", m5_str); + no_m5 = 1; } } @@ -1954,8 +1956,11 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { return -1; fd->refs->fp = NULL; } - if (!(refs = refs_load_fai(fd->refs, fn, 0))) + if (!(refs = refs_load_fai(fd->refs, fn, 0))){ + hts_log_warning("Failed to find reference \"%s\" from the @SQ UR: tag.", fn); + return -1; + } sanitise_SQ_lines(fd); fd->refs = refs; @@ -1984,7 +1989,12 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { /* Read the whole sequence into memory, as we're dealing with a remote file */ kstring_t ref_seq = {0}; - hFILE* ref_hfile = hopen(ref_fn, "r"); + hFILE* ref_hfile; + if (!(ref_hfile = hopen(ref_fn, "r"))){ + hts_log_error("Failed to open reference \"%s\": %s", ref_fn, strerror(errno)); + + return -1; + } do{ ks_resize(&ref_seq, ks_len(&ref_seq) + READ_LENGTH); @@ -2022,7 +2032,6 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { free(ref_fn); return 0; - } static void cram_ref_incr_locked(refs_t *r, int id) {