Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ Parameters::Parameters():
PARAM_ID_MODE(PARAM_ID_MODE_ID, "--id-mode", "Database ID mode", "Select DB entries based on 0: database keys, 1: FASTA identifiers (.lookup)", typeid(int), (void *) &dbIdMode, "^[0-1]{1}$"),
PARAM_TAR_INCLUDE(PARAM_TAR_INCLUDE_ID, "--tar-include", "Tar Inclusion Regex", "Include file names based on this regex", typeid(std::string), (void *) &tarInclude, "^.*$"),
PARAM_TAR_EXCLUDE(PARAM_TAR_EXCLUDE_ID, "--tar-exclude", "Tar Exclusion Regex", "Exclude file names based on this regex", typeid(std::string), (void *) &tarExclude, "^.*$"),
PARAM_INPUT_MODE(PARAM_INPUT_MODE_ID, "--input-mode", "Input list mode", "0: only index, 1: index and range", typeid(int), (void *) &inputmode, "^[0-1]{1}$"),
// unpackdb
PARAM_UNPACK_SUFFIX(PARAM_UNPACK_SUFFIX_ID, "--unpack-suffix", "Unpack suffix", "File suffix for unpacked files.\nAdd .gz suffix to write compressed files.", typeid(std::string), (void *) &unpackSuffix, "^.*$"),
PARAM_UNPACK_NAME_MODE(PARAM_UNPACK_NAME_MODE_ID, "--unpack-name-mode", "Unpack name mode", "Name unpacked files by 0: DB key, 1: accession (through .lookup)", typeid(int), (void *) &unpackNameMode, "^[0-1]{1}$"),
Expand Down Expand Up @@ -1169,6 +1170,7 @@ Parameters::Parameters():
createsubdb.push_back(&PARAM_SUBDB_MODE);
createsubdb.push_back(&PARAM_ID_MODE);
createsubdb.push_back(&PARAM_V);
createsubdb.push_back(&PARAM_INPUT_MODE);

// renamedbkeys
renamedbkeys.push_back(&PARAM_SUBDB_MODE);
Expand Down Expand Up @@ -2623,6 +2625,7 @@ void Parameters::setDefaults() {
// createsubdb
subDbMode = Parameters::SUBDB_MODE_HARD;
dbIdMode = Parameters::ID_MODE_KEYS;
inputmode = 0;

// tar2db
tarInclude = ".*";
Expand Down
2 changes: 2 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,7 @@ class Parameters {
// createsubdb
int subDbMode;
int dbIdMode;
int inputmode;

// tar2db
std::string tarInclude;
Expand Down Expand Up @@ -1065,6 +1066,7 @@ class Parameters {
// createsubdb
PARAMETER(PARAM_SUBDB_MODE)
PARAMETER(PARAM_ID_MODE)
PARAMETER(PARAM_INPUT_MODE)

// tar2db
PARAMETER(PARAM_TAR_INCLUDE)
Expand Down
55 changes: 46 additions & 9 deletions src/util/createsubdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ int createsubdb(int argc, const char **argv, const Command& command) {
Parameters& par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

bool isIndex = false;
FILE *orderFile = NULL;
if (FileUtil::fileExists(par.db1Index.c_str())) {
orderFile = fopen(par.db1Index.c_str(), "r");
isIndex = true;
} else {
if(FileUtil::fileExists(par.db1.c_str())){
orderFile = fopen(par.db1.c_str(), "r");
Expand All @@ -22,7 +24,8 @@ int createsubdb(int argc, const char **argv, const Command& command) {
EXIT(EXIT_FAILURE);
}
}

// no multithreading: every write below uses thread index 0
unsigned int thread_idx = 0;
const bool lookupMode = par.dbIdMode == Parameters::ID_MODE_LOOKUP;
int dbMode = DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA;
if (lookupMode) {
Expand All @@ -32,16 +35,21 @@ int createsubdb(int argc, const char **argv, const Command& command) {
reader.open(DBReader<unsigned int>::NOSORT);
const bool isCompressed = reader.isCompressed();

DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, 0, Parameters::DBTYPE_OMIT_FILE);
DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, isCompressed, Parameters::DBTYPE_OMIT_FILE);
writer.open();
// getline (re)allocates the line buffer automatically
char *line = NULL;
size_t len = 0;
char dbKey[256];
unsigned int prevKey = 0;
bool isOrdered = true;
char* result;
char newLine = '\n';
char nullByte = '\0';
std::vector<std::string> arr;
while (getline(&line, &len, orderFile) != -1) {
Util::parseKey(line, dbKey);
arr = Util::split(line, "\t");
unsigned int key;
if (lookupMode) {
size_t lookupId = reader.getLookupIdByAccession(dbKey);
Expand All @@ -62,21 +70,51 @@ int createsubdb(int argc, const char **argv, const Command& command) {
continue;
}
if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) {
writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), 0);
} else {
writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), thread_idx);
} else if (isIndex == true || arr.size() == 1 || par.inputmode == 0) {
char* data = reader.getDataUncompressed(id);
size_t originalLength = reader.getEntryLen(id);
size_t entryLength = std::max(originalLength, static_cast<size_t>(1)) - 1;

if (isCompressed) {
// also copy the null byte, since it carries the information whether the entry is compressed or not
entryLength = *(reinterpret_cast<unsigned int *>(data)) + sizeof(unsigned int) + 1;
writer.writeData(data, entryLength, key, 0, false, false);
writer.writeData(data, entryLength, key, thread_idx, false, false);
} else {
writer.writeData(data, entryLength, key, 0, true, false);
writer.writeData(data, entryLength, key, thread_idx, true, false);
}
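// NOTE(review): the original comment ("do not write null byte since") was left unfinished; the index entry is written with originalLength rather than entryLength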
writer.writeIndexEntry(key, writer.getStart(0), originalLength, 0);
writer.writeIndexEntry(key, writer.getStart(0), originalLength, thread_idx);
} else {
if (arr.size()%2 == 0) {
Debug(Debug::ERROR) << "Input list not in format\n";
} else {
char* data;
if (isCompressed) {
data = reader.getDataCompressed(id, thread_idx);
} else {
data = reader.getDataUncompressed(id);
}
size_t entryLength = std::max(reader.getEntryLen(id), static_cast<size_t>(1));
int totalLength = 0;
result = new char[entryLength];
for (int ord = 0 ; ord < int((arr.size()-1)/2); ord ++) {
int currLength = std::stoi(arr[ord * 2 + 2]) - std::stoi(arr[ord * 2 + 1]) + 1;
strncpy(result + totalLength, data + std::stoi(arr[ord * 2 + 1]), currLength);
totalLength += currLength;
}
result[totalLength] = newLine;
if (isCompressed) {
writer.writeData(result, totalLength + 1, key, thread_idx, true, false);
} else {
writer.writeData(result, totalLength, key, thread_idx, false, false);
writer.writeAdd(&newLine, sizeof(char), thread_idx);
writer.writeAdd(&nullByte, sizeof(char), thread_idx);
}
delete [] result;
result = nullptr;

writer.writeIndexEntry(key, writer.getStart(0), totalLength + 2, thread_idx);
}
}
}
// merge any kind of sequence database
Expand All @@ -89,7 +127,6 @@ int createsubdb(int argc, const char **argv, const Command& command) {
}
DBWriter::writeDbtypeFile(par.db3.c_str(), reader.getDbtype(), isCompressed);
DBReader<unsigned int>::softlinkDb(par.db2, par.db3, DBFiles::SEQUENCE_ANCILLARY);

free(line);
reader.close();
if (fclose(orderFile) != 0) {
Expand Down
Loading