Skip to content

Commit 97d469e

Browse files
authored
Merge pull request #396 from AdamaJava/qsig_bed_file
feat(qsignature): add blocklist option that takes a BED file and ignores positions contained therein
2 parents 28ea931 + 419c2df commit 97d469e

File tree

5 files changed

+566
-20
lines changed

5 files changed

+566
-20
lines changed

qsignature/src/org/qcmg/sig/Compare.java

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.commons.math3.util.Pair;
2828
import org.qcmg.common.log.QLogger;
2929
import org.qcmg.common.log.QLoggerFactory;
30+
import org.qcmg.common.model.PositionRange;
3031
import org.qcmg.common.util.Constants;
3132
import org.qcmg.common.util.FileUtils;
3233
import org.qcmg.sig.model.Comparison;
@@ -82,7 +83,20 @@ public class Compare {
8283

8384
private final ConcurrentMap<File, Pair<SigMeta,TIntByteHashMap>> cache = new ConcurrentHashMap<>();
8485

86+
private final Map<String, List<PositionRange>> blockPositions = new THashMap<>();
87+
private String blocklist;
88+
8589
private int engage() throws Exception {
90+
91+
if (null != blocklist) {
92+
SignatureUtil.loadBlockListIntoMap(blocklist, blockPositions);
93+
int totalEntries = blockPositions.values().stream()
94+
.mapToInt(List::size)
95+
.sum();
96+
97+
logger.info("Total blocked positions: " + totalEntries);
98+
99+
}
86100

87101
// get excludes
88102
logger.info("Retrieving excludes list from: " + excludeVcfsFile);
@@ -197,8 +211,9 @@ Pair<SigMeta, TIntByteHashMap> getSignatureData(File f) throws IOException {
197211
// if not - load
198212
Pair<SigMeta, TIntByteHashMap> result = cache.get(f);
199213
if (result == null) {
200-
Pair<SigMeta, TMap<String, TIntByteHashMap>> rgResults = SignatureUtil.loadSignatureGenotype(f, minimumCoverage, minimumRGCoverage, homCutoff, upperHetCutoff, lowerHetCutoff);
201-
214+
// Pair<SigMeta, TMap<String, TIntByteHashMap>> rgResults = SignatureUtil.loadSignatureGenotype(f, minimumCoverage, minimumRGCoverage, homCutoff, upperHetCutoff, lowerHetCutoff);
215+
Pair<SigMeta, TMap<String, TIntByteHashMap>> rgResults = SignatureUtil.loadSignatureGenotype(f, minimumCoverage, minimumRGCoverage, homCutoff, upperHetCutoff, lowerHetCutoff, blockPositions);
216+
202217

203218
/*
204219
* deal with empty file scenario first
@@ -423,10 +438,14 @@ protected int setup(String args[]) throws Exception{
423438

424439
additionalSearchStrings = options.getAdditionalSearchString();
425440
logger.tool("Setting additionalSearchStrings to: " + Arrays.deepToString(additionalSearchStrings));
441+
442+
blocklist = options.getBlocklist();
443+
if (null != blocklist) {
444+
logger.tool("Blocklist: " + blocklist);
445+
}
426446

427447
if (options.hasExcludeVcfsFileOption())
428448
excludeVcfsFile = options.getExcludeVcfsFile();
429-
430449
logger.logInitialExecutionStats("Compare", Compare.class.getPackage().getImplementationVersion(), args);
431450

432451
return engage();

qsignature/src/org/qcmg/sig/Options.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ final class Options {
123123
.describedAs("emailSubject");
124124
parser.accepts("excludeVcfsFile", "file containing a list of vcf files to ignore in the comparison").withRequiredArg().ofType(String.class)
125125
.describedAs("excludeVcfsFile");
126+
parser.accepts("blocklist", "BED file containing a list of genomic position to ignore in the comparison").withRequiredArg().ofType(String.class)
127+
.describedAs("BED file containing regions to ignore");
126128
parser.accepts("validation", VALIDATION_STRINGENCY_OPTION_DESCRIPTION).withRequiredArg().ofType(String.class);
127129
parser.accepts("stream", STREAM_OPTION_DESCRIPTION);
128130
parser.acceptsAll(asList("p", "position"), "File containing a list of positions that will be examined. Must be a subset of the positions in the snpPositions file").withRequiredArg().ofType(String.class).describedAs("position");
@@ -296,6 +298,9 @@ public boolean hasExcludeVcfsFileOption() {
296298
public String getExcludeVcfsFile() {
297299
return (String) options.valueOf("excludeVcfsFile");
298300
}
301+
public String getBlocklist() {
302+
return (String) options.valueOf("blocklist");
303+
}
299304
public Optional<String> getGeneModelFile() {
300305
return Optional.ofNullable((String) options.valueOf("geneModel"));
301306
}

qsignature/src/org/qcmg/sig/util/SignatureUtil.java

Lines changed: 69 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import static java.util.Comparator.comparing;
1010

11+
import java.io.BufferedReader;
1112
import java.io.File;
1213
import java.io.IOException;
1314
import java.nio.charset.StandardCharsets;
@@ -39,6 +40,7 @@
3940
import org.apache.commons.math3.util.Pair;
4041
import org.qcmg.common.log.QLogger;
4142
import org.qcmg.common.log.QLoggerFactory;
43+
import org.qcmg.common.model.PositionRange;
4244
import org.qcmg.common.string.StringUtils;
4345
import org.qcmg.common.util.BaseUtils;
4446
import org.qcmg.common.util.ChrPositionCache;
@@ -336,11 +338,14 @@ public static float getVAF(int[] counts, String ref) {
336338
};
337339
}
338340

339-
public static Pair<SigMeta, TMap<String, TIntByteHashMap>> loadSignatureGenotype(File file, int minCoverage, int minRGCoverage) throws IOException {
340-
return loadSignatureGenotype(file, minCoverage, minRGCoverage, HOM_CUTOFF, HET_UPPER_CUTOFF, HET_LOWER_CUTOFF);
341+
public static Pair<SigMeta, TMap<String, TIntByteHashMap>> loadSignatureGenotype(File file, int minCoverage, int minRGCoverage) throws IOException {
342+
return loadSignatureGenotype(file, minCoverage, minRGCoverage, HOM_CUTOFF, HET_UPPER_CUTOFF, HET_LOWER_CUTOFF, null);
343+
}
344+
public static Pair<SigMeta, TMap<String, TIntByteHashMap>> loadSignatureGenotype(File file, int minCoverage, int minRGCoverage, Map<String, List<PositionRange>> blockedPositions) throws IOException {
345+
return loadSignatureGenotype(file, minCoverage, minRGCoverage, HOM_CUTOFF, HET_UPPER_CUTOFF, HET_LOWER_CUTOFF, blockedPositions);
341346
}
342347

343-
public static Pair<SigMeta, TMap<String, TIntByteHashMap>> loadSignatureGenotype(File file, int minCoverage, int minRGCoverage, float homCutoff, float upperHetCutoff, float lowerHetCutoff) throws IOException {
348+
public static Pair<SigMeta, TMap<String, TIntByteHashMap>> loadSignatureGenotype(File file, int minCoverage, int minRGCoverage, float homCutoff, float upperHetCutoff, float lowerHetCutoff, Map<String, List<PositionRange>> blockedPositions) throws IOException {
344349
if (null == file) {
345350
throw new IllegalArgumentException("Null file object passed to loadSignatureGenotype");
346351
}
@@ -361,16 +366,55 @@ public static Pair<SigMeta, TMap<String, TIntByteHashMap>> loadSignatureGenotype
361366
}
362367

363368
if (null != sm && sm.isValid()) {
364-
getDataFromBespokeLayout(file, minCoverage, minRGCoverage, ratios, rgRatios, rgIds, reader, homCutoff, upperHetCutoff, lowerHetCutoff);
369+
getDataFromBespokeLayout(file, minCoverage, minRGCoverage, ratios, rgRatios, rgIds, reader, homCutoff, upperHetCutoff, lowerHetCutoff, blockedPositions);
365370
} else {
366371
rgRatios.put("all", loadSignatureRatiosFloatGenotypeNew(file, MINIMUM_COVERAGE, homCutoff, upperHetCutoff, lowerHetCutoff));
367372
}
368373
}
369374
return new Pair<>(sm, rgRatios);
370375
}
376+
377+
public static void loadBlockListIntoMap(String blocklistFile, Map<String, List<PositionRange>> map) {
378+
try {
379+
// Use buffered reading with larger buffer for better I/O performance
380+
try (BufferedReader reader = Files.newBufferedReader(Paths.get(blocklistFile), StandardCharsets.UTF_8)) {
381+
382+
String line;
383+
while ((line = reader.readLine()) != null) {
384+
// Skip comments and empty lines early
385+
if (line.isEmpty() || line.charAt(0) == '#') continue;
386+
387+
// Use indexOf instead of split for better performance
388+
int firstTab = line.indexOf('\t');
389+
if (firstTab == -1) continue;
390+
391+
int secondTab = line.indexOf('\t', firstTab + 1);
392+
if (secondTab == -1) continue;
393+
394+
// Check if there's a third tab (tokens.length >= 3 equivalent)
395+
int thirdTab = line.indexOf('\t', secondTab + 1);
396+
if (thirdTab == -1 && secondTab == line.length() - 1) continue;
397+
398+
try {
399+
String contig = line.substring(0, firstTab);
400+
int start = Integer.parseInt(line, firstTab + 1, secondTab, 10);
401+
int stop = Integer.parseInt(line, secondTab + 1,
402+
thirdTab == -1 ? line.length() : thirdTab, 10);
403+
404+
map.computeIfAbsent(contig, v -> new ArrayList<>()).add(new PositionRange(start, stop));
405+
} catch (NumberFormatException e) {
406+
// Skip malformed lines silently or log if needed
407+
logger.debug("Skipping malformed line: " + line);
408+
}
409+
}
410+
}
411+
} catch (IOException e) {
412+
logger.error("Error reading blocklist file: " + blocklistFile, e);
413+
}
414+
}
371415

372416
public static void getDataFromBespokeLayout(File file, int minCoverage, int minRGCoverage, TIntByteHashMap ratios,
373-
TMap<String, TIntByteHashMap> rgRatios, Map<String, String> rgIds, StringFileReader reader, float homCutoff, float upperHetCutoff, float lowerHetCutoff) {
417+
TMap<String, TIntByteHashMap> rgRatios, Map<String, String> rgIds, StringFileReader reader, float homCutoff, float upperHetCutoff, float lowerHetCutoff, Map<String, List<PositionRange>> blockedPositions) {
374418
int noOfRGs = rgIds.size();
375419
logger.debug("Number of rgs for " + file.getAbsolutePath() + " is " + noOfRGs);
376420

@@ -386,9 +430,26 @@ public static void getDataFromBespokeLayout(File file, int minCoverage, int minR
386430

387431
String coverage = line.substring(line.lastIndexOf(Constants.TAB_STRING));
388432
String chrPosString = line.substring(0, line.indexOf(Constants.TAB_STRING, line.indexOf(Constants.TAB_STRING) + 1));
389-
390-
391-
/*
433+
434+
if (null != blockedPositions) {
435+
/*
436+
get chr and position
437+
*/
438+
int tabIndex = chrPosString.indexOf(Constants.TAB);
439+
String chr = chrPosString.substring(0, tabIndex);
440+
List<PositionRange> list = blockedPositions.get(chr);
441+
if (null != list) {
442+
int pos = Integer.parseInt(chrPosString, tabIndex + 1, chrPosString.length(), 10);
443+
boolean blocked = list.stream().anyMatch(r -> r.containsPosition(pos));
444+
if (blocked) {
445+
logger.debug("Found blocked position for " + chrPosString);
446+
continue;
447+
}
448+
}
449+
}
450+
451+
452+
/*
392453
* This should be in the QAF=t:5-0-0-0,rg4:2-0-0-0,rg1:1-0-0-0,rg2:2-0-0-0 format
393454
* Need to tease out the pertinent bits
394455
*/
@@ -402,7 +463,6 @@ public static void getDataFromBespokeLayout(File file, int minCoverage, int minR
402463

403464
if (isCodedGenotypeValid(genotype1)) {
404465
cachePosition.set(ChrPositionCache.getStringIndex(chrPosString));
405-
406466
ratios.put(cachePosition.get(), genotype1);
407467
/*
408468
* Get rg data if we have more than 1 rg

qsignature/test/org/qcmg/sig/CompareTest.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,8 @@ public void nonEmptyAndEmptyInputFiles() throws Exception {
6060
writeVcfFile(f1);
6161
writeVcfFileHeader(f2);
6262
writeVcfFile(f3);
63-
6463
Executor exec = execute("--log " + logF.getAbsolutePath() + " -d " + f1.getParent() + " -o " + o.getAbsolutePath());
65-
assertEquals(0, exec.getErrCode()); // all ok
64+
assertEquals(0, exec.getErrCode()); // all ok
6665
assertTrue(o.exists());
6766
List<String> outputData = Files.readAllLines(Paths.get(o.getAbsolutePath()));
6867
assertEquals(14, outputData.size()); // 13 lines means 3 comparison

0 commit comments

Comments
 (0)