Skip to content

Commit 3af49b4

Browse files
committed
Added mmap support for references.
This greatly reduces memory usage when many jobs run on the same machine, because the mmapped references are then shared between processes.
1 parent f83dfd2 commit 3af49b4

File tree

5 files changed

+96
-14
lines changed

5 files changed

+96
-14
lines changed

cram/cram_io.c

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1462,6 +1462,19 @@ int paranoid_fclose(FILE *fp) {
14621462
* track the number of callers interested in any specific reference.
14631463
*/
14641464

1465+
/*
1466+
* Frees/unmaps a reference sequence and associated file handles.
1467+
*/
1468+
static void ref_entry_free_seq(ref_entry *e) {
1469+
if (e->mf)
1470+
mfclose(e->mf);
1471+
if (e->seq && !e->mf)
1472+
free(e->seq);
1473+
1474+
e->seq = NULL;
1475+
e->mf = NULL;
1476+
}
1477+
14651478
void refs_free(refs_t *r) {
14661479
RP("refs_free()\n");
14671480

@@ -1484,8 +1497,7 @@ void refs_free(refs_t *r) {
14841497
continue;
14851498
if (!(e = kh_val(r->h_meta, k)))
14861499
continue;
1487-
if (e->seq)
1488-
free(e->seq);
1500+
ref_entry_free_seq(e);
14891501
free(e);
14901502
}
14911503

@@ -1660,6 +1672,7 @@ static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
16601672

16611673
e->count = 0;
16621674
e->seq = NULL;
1675+
e->mf = NULL;
16631676

16641677
k = kh_put(refs, r->h_meta, e->name, &n);
16651678
if (-1 == n) {
@@ -2027,6 +2040,13 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
20272040
if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
20282041
size_t sz;
20292042
r->seq = mfsteal(mf, &sz);
2043+
if (r->seq) {
2044+
r->mf = NULL;
2045+
} else {
2046+
// keep mf around as we couldn't detach
2047+
r->seq = mf->data;
2048+
r->mf = mf;
2049+
}
20302050
r->length = sz;
20312051
} else {
20322052
refs_t *refs;
@@ -2141,8 +2161,7 @@ static void cram_ref_decr_locked(refs_t *r, int id) {
21412161
r->ref_id[r->last_id]->seq) {
21422162
RP("%d FREE REF %d (%p)\n", gettid(),
21432163
r->last_id, r->ref_id[r->last_id]->seq);
2144-
free(r->ref_id[r->last_id]->seq);
2145-
r->ref_id[r->last_id]->seq = NULL;
2164+
ref_entry_free_seq(r->ref_id[r->last_id]);
21462165
r->ref_id[r->last_id]->length = 0;
21472166
}
21482167
}
@@ -2257,10 +2276,8 @@ ref_entry *cram_ref_load(refs_t *r, int id) {
22572276
assert(r->last->count > 0);
22582277
if (--r->last->count <= 0) {
22592278
RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
2260-
if (r->last->seq) {
2261-
free(r->last->seq);
2262-
r->last->seq = NULL;
2263-
}
2279+
if (r->last->seq)
2280+
ref_entry_free_seq(r->last);
22642281
}
22652282
}
22662283

@@ -2284,6 +2301,7 @@ ref_entry *cram_ref_load(refs_t *r, int id) {
22842301

22852302
RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
22862303
e->seq = seq;
2304+
e->mf = NULL;
22872305
e->count++;
22882306

22892307
/*

cram/cram_structs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5050

5151
#include "cram/thread_pool.h"
5252
#include "cram/string_alloc.h"
53+
#include "cram/mFILE.h"
5354
#include "htslib/khash.h"
5455

5556
#ifdef __cplusplus
@@ -585,6 +586,7 @@ typedef struct ref_entry {
585586
int line_length;
586587
int64_t count; // for shared references so we know to dealloc seq
587588
char *seq;
589+
mFILE *mf;
588590
} ref_entry;
589591

590592
KHASH_MAP_INIT_STR(refs, ref_entry*)

cram/mFILE.c

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4747
#include "cram/mFILE.h"
4848
#include "cram/vlen.h"
4949

50+
#include <sys/mman.h>
51+
5052
/*
5153
* This file contains memory-based versions of the most commonly used
5254
* (by io_lib) stdio functions.
@@ -106,6 +108,33 @@ static char *mfload(FILE *fp, const char *fn, size_t *size, int binary) {
106108
return data;
107109
}
108110

111+
112+
#ifdef HAVE_MMAP
113+
/*
114+
* mmaps in the file, but only for reading currently.
115+
*
116+
* Returns 0 on success
117+
* -1 on failure
118+
*/
119+
int mfmmap(mFILE *mf, FILE *fp, const char *fn) {
120+
struct stat sb;
121+
122+
if (stat(fn, &sb) != 0)
123+
return -1;
124+
125+
mf->size = sb.st_size;
126+
mf->data = mmap(NULL, mf->size, PROT_READ, MAP_SHARED,
127+
fileno(fp), 0);
128+
129+
if (!mf->data)
130+
return -1;
131+
132+
mf->alloced = 0;
133+
return 0;
134+
}
135+
#endif
136+
137+
109138
/*
110139
* Creates and returns m_channel[0].
111140
* We initialise this on the first attempted read, which then slurps in
@@ -233,6 +262,8 @@ mFILE *mfreopen(const char *path, const char *mode_str, FILE *fp) {
233262
* w = write on close
234263
* a = position at end of buffer
235264
* x = position at same location as the original fp, don't seek on flush
265+
* + = for update (read and write)
266+
* m = mmap (read only)
236267
*/
237268
if (strchr(mode_str, 'r'))
238269
r = 1, mode |= MF_READ;
@@ -249,15 +280,29 @@ mFILE *mfreopen(const char *path, const char *mode_str, FILE *fp) {
249280
if (a)
250281
r = 1;
251282
}
283+
#ifdef HAVE_MMAP
284+
if (strchr(mode_str, 'm'))
285+
if (!w) mode |= MF_MMAP;
286+
#endif
252287

253288
if (r) {
254289
mf = mfcreate(NULL, 0);
255290
if (NULL == mf) return NULL;
256291
if (!(mode & MF_TRUNC)) {
257-
mf->data = mfload(fp, path, &mf->size, b);
258-
mf->alloced = mf->size;
259-
if (!a)
260-
fseek(fp, 0, SEEK_SET);
292+
#ifdef HAVE_MMAP
293+
if (mode & MF_MMAP) {
294+
if (mfmmap(mf, fp, path) == -1) {
295+
mf->data = NULL;
296+
mode &= ~MF_MMAP;
297+
}
298+
}
299+
#endif
300+
if (!mf->data) {
301+
mf->data = mfload(fp, path, &mf->size, b);
302+
mf->alloced = mf->size;
303+
if (!a)
304+
fseek(fp, 0, SEEK_SET);
305+
}
261306
}
262307
} else if (w) {
263308
/* Write - initialise the data structures */
@@ -307,6 +352,14 @@ int mfclose(mFILE *mf) {
307352

308353
mfflush(mf);
309354

355+
#ifdef HAVE_MMAP
356+
if ((mf->mode & MF_MMAP) && mf->data) {
357+
/* Mmaped */
358+
munmap(mf->data, mf->size);
359+
mf->data = NULL;
360+
}
361+
#endif
362+
310363
if (mf->fp)
311364
fclose(mf->fp);
312365

@@ -318,12 +371,16 @@ int mfclose(mFILE *mf) {
318371
/*
319372
* Closes the file pointer contained within the mFILE without destroying
320373
* the in-memory data.
374+
*
375+
* Attempting to do this on an mmapped buffer is an error: -1 is returned and the file pointer is not closed.
321376
*/
322377
int mfdetach(mFILE *mf) {
323378
if (!mf)
324379
return -1;
325380

326381
mfflush(mf);
382+
if (mf->mode & MF_MMAP)
383+
return -1;
327384

328385
if (mf->fp) {
329386
fclose(mf->fp);
@@ -352,6 +409,8 @@ int mfdestroy(mFILE *mf) {
352409
* It is up to the caller to free the stolen buffer. If size_out is
353410
* not NULL, mf->size will be stored in it.
354411
* This is more-or-less the opposite of mfcreate().
412+
*
413+
* Note, we cannot steal the buffer from an mmapped mFILE: NULL is returned (size_out is still filled in) and the mFILE is not destroyed.
355414
*/
356415

357416
void *mfsteal(mFILE *mf, size_t *size_out) {
@@ -363,7 +422,9 @@ void *mfsteal(mFILE *mf, size_t *size_out) {
363422

364423
if (NULL != size_out) *size_out = mf->size;
365424

366-
mfdetach(mf);
425+
if (mfdetach(mf) != 0)
426+
return NULL;
427+
367428
mf->data = NULL;
368429
mfdestroy(mf);
369430

cram/mFILE.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ typedef struct {
5454
#define MF_BINARY 8
5555
#define MF_TRUNC 16
5656
#define MF_MODEX 32
57+
#define MF_MMAP 64
5758

5859
mFILE *mfreopen(const char *path, const char *mode, FILE *fp);
5960
mFILE *mfopen(const char *path, const char *mode);

cram/open_trace_file.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ static mFILE *find_file_dir(char *file, char *dirname) {
259259
}
260260

261261
if (is_file(path)) {
262-
return mfopen(path, "rb");
262+
return mfopen(path, "rbm");
263263
}
264264

265265
return NULL;

0 commit comments

Comments
 (0)