diff --git a/lib/htsjdk-V1.0.0-fork.jar b/lib/htsjdk-V1.0.0-fork.jar new file mode 100644 index 000000000..deb209cfe Binary files /dev/null and b/lib/htsjdk-V1.0.0-fork.jar differ diff --git a/qmule/build.gradle b/qmule/build.gradle index 5cefcc450..e700e80e8 100644 --- a/qmule/build.gradle +++ b/qmule/build.gradle @@ -9,7 +9,6 @@ dependencies { implementation project(':qcommon') implementation project(':qio') implementation project(':qpicard') -// implementation 'com.google.code.findbugs:findbugs:1.3.9' implementation 'net.sf.jopt-simple:jopt-simple:5.0.4' implementation 'com.github.broadinstitute:picard:3.1.1' implementation 'org.broadinstitute:barclay:5.0.0' diff --git a/qmule/src/org/qcmg/qmule/AsyncCRAMReader.java b/qmule/src/org/qcmg/qmule/AsyncCRAMReader.java new file mode 100644 index 000000000..15e925f49 --- /dev/null +++ b/qmule/src/org/qcmg/qmule/AsyncCRAMReader.java @@ -0,0 +1,82 @@ +package org.qcmg.qmule; + +import htsjdk.samtools.*; +import htsjdk.samtools.util.CloseableIterator; +import org.qcmg.common.log.QLogger; +import org.qcmg.common.log.QLoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.AbstractQueue; +import java.util.concurrent.*; + +public class AsyncCRAMReader { + + private static final QLogger logger = QLoggerFactory.getLogger(AsyncCRAMReader.class); + + public static final int CHECK_SIZE_INTERVAL = 2000000; + + private final ExecutorService executor; + private final SamReader samReader; + private final AbstractQueue recordQueue; + private final int queueCapacity; + private volatile boolean isIteratorEmpty = false; + + public AsyncCRAMReader(InputStream cramStream, File referenceFile, int queueCapacity) { + this.executor = Executors.newSingleThreadExecutor(); + this.samReader = SamReaderFactory.makeDefault() + .referenceSequence(referenceFile) + .validationStringency(ValidationStringency.DEFAULT_STRINGENCY) + .open(SamInputResource.of(cramStream)); + this.recordQueue = new ConcurrentLinkedQueue<>(); + this.queueCapacity = queueCapacity; + startReading(); + } + + private void startReading() { + logger.info("Start reading"); + executor.submit(() -> { + try (CloseableIterator iterator = samReader.iterator()) { + int i = 0; + int j = 1; + while (iterator.hasNext()) { + SAMRecord record = iterator.next(); + recordQueue.add(record); + if (++i >= CHECK_SIZE_INTERVAL) { + logger.info("Processed " + (i * j) + " records"); + if (recordQueue.size() >= queueCapacity) { + logger.info("Queue is full, waiting for it to be emptied"); + while (recordQueue.size() >= queueCapacity) { + logger.info("Queue is full, waiting for it to be emptied: " + recordQueue.size()); + Thread.sleep(100); + } + } + i = 0; + j++; + } + } + logger.info("iterator is empty, processed " + (j * CHECK_SIZE_INTERVAL + i) + " records"); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } finally { + executor.shutdown(); + isIteratorEmpty = true; + logger.info("iterator is empty (in finally)"); + } + }); + } + + public SAMRecord take() throws InterruptedException { + return recordQueue.poll(); + } + + public boolean isIteratorEmpty() { + return isIteratorEmpty && recordQueue.isEmpty(); + } + + public void close() throws IOException { + samReader.close(); + executor.shutdownNow(); + } +} \ No newline at end of file diff --git a/qmule/src/org/qcmg/qmule/SamFileValidator.java b/qmule/src/org/qcmg/qmule/SamFileValidator.java new file mode 100644 index 000000000..b034ba991 --- /dev/null +++ b/qmule/src/org/qcmg/qmule/SamFileValidator.java @@ -0,0 +1,974 @@ +/* + * The MIT License + * + * Copyright (c) 2009-2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package org.qcmg.qmule; + +import htsjdk.samtools.*; +import htsjdk.samtools.SAMValidationError.Type; +import htsjdk.samtools.BamIndexValidator.IndexValidationStringency; +import htsjdk.samtools.metrics.MetricBase; +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileWalker; +import htsjdk.samtools.util.BlockCompressedInputStream; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.samtools.util.FastqQualityFormat; +import htsjdk.samtools.util.Histogram; +import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.ProgressLogger; +import htsjdk.samtools.util.QualityEncodingDetector; +import htsjdk.samtools.util.SequenceUtil; +import htsjdk.samtools.util.StringUtil; + +import java.io.*; +import java.nio.file.Path; +import java.util.*; +import java.util.stream.Collectors; + +/** + * Validates SAM files as follows: + * + * + * @author Doug Voet + * @see SAMRecord#isValid() + */ +public class SamFileValidator { + + private final static Log log = Log.getInstance(SamFileValidator.class); + + private final PrintWriter out; + private Histogram errorsByType; + private PairEndInfoMap pairEndInfoByName; + private ReferenceSequenceFileWalker refFileWalker; + private SAMSequenceDictionary samSequenceDictionary; + private boolean verbose; + private int maxVerboseOutput; + private SAMSortOrderChecker orderChecker; + private Set errorsToIgnore; + private boolean ignoreWarnings; + private boolean skipMateValidation; + private boolean bisulfiteSequenced; + private IndexValidationStringency indexValidationStringency; + private boolean sequenceDictionaryEmptyAndNoWarningEmitted; + private int numWarnings; + private int numErrors; + + private final int maxTempFiles; + private int qualityNotStoredErrorCount = 0; + public static final int MAX_QUALITY_NOT_STORED_ERRORS = 100; + + public SamFileValidator(final PrintWriter out, final int maxTempFiles) { + this.out = out; + this.maxTempFiles = maxTempFiles; + this.errorsByType = new Histogram<>(); + this.refFileWalker = null; + this.maxVerboseOutput = 100; + this.indexValidationStringency = IndexValidationStringency.NONE; + this.errorsToIgnore = EnumSet.noneOf(Type.class); + this.verbose = false; + this.ignoreWarnings = false; + this.skipMateValidation = false; + this.bisulfiteSequenced = false; + this.sequenceDictionaryEmptyAndNoWarningEmitted = false; + this.numWarnings = 0; + this.numErrors = 0; + } + + Histogram getErrorsByType() { + return errorsByType; + } + + /** + * Sets one or more error types that should not be reported on. + */ + public void setErrorsToIgnore(final Collection types) { + if (!types.isEmpty()) { + this.errorsToIgnore = EnumSet.copyOf(types); + } + } + + public void setIgnoreWarnings(final boolean ignoreWarnings) { + this.ignoreWarnings = ignoreWarnings; + } + + /** + * Sets whether or not we should run mate validation beyond the mate cigar check, which + * is useful in extreme edge cases that would require a lot of memory to do the validation. + * + * @param skipMateValidation should this tool skip mate validation + */ + public void setSkipMateValidation(final boolean skipMateValidation) { + this.skipMateValidation = skipMateValidation; + } + + /** + * @return true if the validator will skip mate validation, otherwise false + */ + public boolean getSkipMateValidation() { + return skipMateValidation; + } + + /** + * Outputs validation summary report to out. + * + * @param samReader records to validate + * @param reference if null, NM tag validation is skipped + * @return boolean true if there are no validation errors, otherwise false + */ + public boolean validateSamFileSummary(final SamReader samReader, final ReferenceSequenceFile reference, Path inputFilePath, File REFERENCE_SEQUENCE) throws FileNotFoundException { + init(reference, samReader.getFileHeader()); + + validateSamFile(samReader, out, inputFilePath, REFERENCE_SEQUENCE); + + boolean result = errorsByType.isEmpty(); + + if (errorsByType.getCount() > 0) { + // Convert to a histogram with String IDs so that WARNING: or ERROR: can be prepended to the error type. + final Histogram errorsAndWarningsByType = new Histogram<>("Error Type", "Count"); + for (final Histogram.Bin bin : errorsByType.values()) { + errorsAndWarningsByType.increment(bin.getId().getHistogramString(), bin.getValue()); + } + final MetricsFile metricsFile = new MetricsFile<>(); + errorsByType.setBinLabel("Error Type"); + errorsByType.setValueLabel("Count"); + metricsFile.setHistogram(errorsAndWarningsByType); + metricsFile.write(out); + } + cleanup(); + return result; + } + + /** + * Outputs validation error details to out. + * + * @param samReader records to validate + * @param reference if null, NM tag validation is skipped + * processing will stop after this threshold has been reached + * @param REFERENCE_SEQUENCE + * @return boolean true if there are no validation errors, otherwise false + */ + public boolean validateSamFileVerbose(final SamReader samReader, final ReferenceSequenceFile reference, Path inputFilePath, File REFERENCE_SEQUENCE) { + init(reference, samReader.getFileHeader()); + + try { + validateSamFile(samReader, out, inputFilePath, REFERENCE_SEQUENCE); + } catch (MaxOutputExceededException e) { + out.println("Maximum output of [" + maxVerboseOutput + "] errors reached."); + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } + final boolean result = errorsByType.isEmpty(); + cleanup(); + return result; + } + + public void validateBamFileTermination(final Path inputFile) { + try { + if (!IOUtil.isBlockCompressed(inputFile)) { + return; + } + final BlockCompressedInputStream.FileTermination terminationState = + BlockCompressedInputStream.checkTermination(inputFile); + if (terminationState.equals(BlockCompressedInputStream.FileTermination.DEFECTIVE)) { + addError(new SAMValidationError(Type.TRUNCATED_FILE, "BAM file has defective last gzip block", + inputFile.toUri().toString())); + } else if (terminationState.equals(BlockCompressedInputStream.FileTermination.HAS_HEALTHY_LAST_BLOCK)) { + addError(new SAMValidationError(Type.BAM_FILE_MISSING_TERMINATOR_BLOCK, + "Older BAM file -- does not have terminator block", + inputFile.toUri().toString())); + } + } catch (IOException e) { + throw new SAMException("IOException", e); + } + } + + private void validateSamFile(final SamReader samReader, final PrintWriter out, Path inputFilePath, final File ref) throws FileNotFoundException { + try { + validateHeader(samReader.getFileHeader()); + orderChecker = new SAMSortOrderChecker(samReader.getFileHeader().getSortOrder()); + + AsyncCRAMReader reader = new AsyncCRAMReader(new FileInputStream(inputFilePath.toFile()), ref, 1000000); + + validateSamRecordsAndQualityFormat(reader, samReader.getFileHeader()); + validateUnmatchedPairs(); + if (indexValidationStringency != IndexValidationStringency.NONE) { + try { + if (indexValidationStringency == IndexValidationStringency.LESS_EXHAUSTIVE) { + BamIndexValidator.lessExhaustivelyTestIndex(samReader); + } else { + BamIndexValidator.exhaustivelyTestIndex(samReader); + } + } catch (Exception e) { + addError(new SAMValidationError(Type.INVALID_INDEX_FILE_POINTER, e.getMessage(), null)); + } + } + + if (errorsByType.isEmpty()) { + out.println("No errors found"); + } + } finally { + out.flush(); + } + } + + /** + * Report on reads marked as paired, for which the mate was not found. + */ + private void validateUnmatchedPairs() { + final InMemoryPairEndInfoMap inMemoryPairMap; + if (pairEndInfoByName instanceof CoordinateSortedPairEndInfoMap) { + // For the coordinate-sorted map, need to detect mate pairs in which the mateReferenceIndex on one end + // does not match the readReference index on the other end, so the pairs weren't united and validated. + inMemoryPairMap = new InMemoryPairEndInfoMap(); + final CloseableIterator> it = pairEndInfoByName.iterator(); + while (it.hasNext()) { + final Map.Entry entry = it.next(); + final PairEndInfo pei = inMemoryPairMap.remove(entry.getValue().readReferenceIndex, entry.getKey()); + if (pei != null) { + // Found a mismatch btw read.mateReferenceIndex and mate.readReferenceIndex + final List errors = pei.validateMates(entry.getValue(), entry.getKey()); + for (final SAMValidationError error : errors) { + addError(error); + } + } else { + // Mate not found. + inMemoryPairMap.put(entry.getValue().mateReferenceIndex, entry.getKey(), entry.getValue()); + } + } + it.close(); + } else { + inMemoryPairMap = (InMemoryPairEndInfoMap) pairEndInfoByName; + } + // At this point, everything in InMemoryMap is a read marked as a pair, for which a mate was not found. + for (final Map.Entry entry : inMemoryPairMap) { + addError(new SAMValidationError(Type.MATE_NOT_FOUND, "Mate not found for paired read", entry.getKey())); + } + } + + /** + * SAM record and quality format validations are combined into a single method because validation must be completed + * in only a single pass of the SamRecords (because a SamReader's iterator() method may not return the same + * records on a subsequent call). + */ + private void validateSamRecordsAndQualityFormat(AsyncCRAMReader reader, final SAMFileHeader header) { +// private void validateSamRecordsAndQualityFormat(final Iterable samRecords, final SAMFileHeader header) { + ProgressLogger progress = null; + try { + progress = new ProgressLogger(log, 10000000, "Validated Read"); + final QualityEncodingDetector qualityDetector = new QualityEncodingDetector(); + while ( ! reader.isIteratorEmpty()) { + final SAMRecord record = reader.take(); + if (null == record) { + if (reader.isIteratorEmpty()) { + break; + } else { + Thread.sleep(10); + } + } else { + + + qualityDetector.add(record); + + final long recordNumber = progress.getCount() + 1; + final Collection errors = record.isValid(); + if (errors != null) { + for (final SAMValidationError error : errors) { + error.setRecordNumber(recordNumber); + addError(error); + } + } + + validateMateFields(record, recordNumber); + final boolean hasValidSortOrder = validateSortOrder(record, recordNumber); + validateReadGroup(record, header); + final boolean cigarIsValid = validateCigar(record, recordNumber); + if (cigarIsValid) { + try { + validateNmTag(record, recordNumber); + } catch (SAMException e) { + if (hasValidSortOrder) { + // If a CRAM file has an invalid sort order, the ReferenceFileWalker will throw a + // SAMException due to an out of order request when retrieving reference bases during NM + // tag validation; rethrow the exception only if the sort order is valid, otherwise + // swallow the exception and carry on validating + throw e; + } + } + } + validateSecondaryBaseCalls(record, recordNumber); + validateTags(record, recordNumber); + if (sequenceDictionaryEmptyAndNoWarningEmitted && !record.getReadUnmappedFlag()) { + addError(new SAMValidationError(Type.MISSING_SEQUENCE_DICTIONARY, "Sequence dictionary is empty", null)); + sequenceDictionaryEmptyAndNoWarningEmitted = false; + + } + + if ((qualityNotStoredErrorCount++ < MAX_QUALITY_NOT_STORED_ERRORS) && record.getBaseQualityString().equals("*")) { + addError(new SAMValidationError(Type.QUALITY_NOT_STORED, + "QUAL field is set to * (unspecified quality scores), this is allowed by the SAM" + + " specification but many tools expect reads to include qualities ", + record.getReadName(), recordNumber)); + } + + progress.record(record); + } + } + + try { + if (progress.getCount() > 0) { // Avoid exception being thrown as a result of no qualities being read + final FastqQualityFormat format = qualityDetector.generateBestGuess(QualityEncodingDetector.FileContext.SAM, FastqQualityFormat.Standard); + if (format != FastqQualityFormat.Standard) { + addError(new SAMValidationError(Type.INVALID_QUALITY_FORMAT, String.format("Detected %s quality score encoding, but expected %s.", format, FastqQualityFormat.Standard), null)); + } + } + } catch (SAMException e) { + addError(new SAMValidationError(Type.INVALID_QUALITY_FORMAT, e.getMessage(), null)); + } + } catch (SAMFormatException e) { + // increment record number because the iterator behind the SamReader + // reads one record ahead so we will get this failure one record ahead + assert progress != null; + final String msg = "SAMFormatException on record " + progress.getCount() + 1; + out.println(msg); + throw new SAMException(msg, e); + } catch (FileTruncatedException e) { + addError(new SAMValidationError(Type.TRUNCATED_FILE, "File is truncated", null)); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + + private void validateReadGroup(final SAMRecord record, final SAMFileHeader header) { + final SAMReadGroupRecord rg = record.getReadGroup(); + if (rg == null) { + addError(new SAMValidationError(Type.RECORD_MISSING_READ_GROUP, + "A record is missing a read group", record.getReadName())); + } else if (header.getReadGroup(rg.getId()) == null) { + addError(new SAMValidationError(Type.READ_GROUP_NOT_FOUND, + "A record has a read group not found in the header: ", + record.getReadName() + ", " + rg.getReadGroupId())); + } + } + + /** + * Report error if a tag value is a Long, or if there's a duplicate dag, + * or if there's a CG tag is observed (CG tags are converted to cigars in + * the bam code, and should not appear in other formats) + */ + private void validateTags(final SAMRecord record, final long recordNumber) { + final List attributes = record.getAttributes(); + + final Set tags = new HashSet<>(attributes.size()); + + for (final SAMRecord.SAMTagAndValue tagAndValue : attributes) { + if (tagAndValue.value instanceof Long) { + addError(new SAMValidationError(Type.TAG_VALUE_TOO_LARGE, + "Numeric value too large for tag " + tagAndValue.tag, + record.getReadName(), recordNumber)); + } + + if (!tags.add(tagAndValue.tag)) { + addError(new SAMValidationError(Type.DUPLICATE_SAM_TAG, + "Duplicate SAM tag (" + tagAndValue.tag + ") found.", record.getReadName(), recordNumber)); + } + } + + if (tags.contains(SAMTag.CG.name())){ + addError(new SAMValidationError(Type.CG_TAG_FOUND_IN_ATTRIBUTES, + "The CG Tag should only be used in BAM format to hold a large cigar. " + + "It was found containing the value: " + + record.getAttribute(SAMTag.CG), record.getReadName(), recordNumber)); + } + } + + private void validateSecondaryBaseCalls(final SAMRecord record, final long recordNumber) { + final String e2 = (String) record.getAttribute(SAMTag.E2); + if (e2 != null) { + if (e2.length() != record.getReadLength()) { + addError(new SAMValidationError(Type.MISMATCH_READ_LENGTH_AND_E2_LENGTH, + String.format("E2 tag length (%d) != read length (%d)", e2.length(), record.getReadLength()), + record.getReadName(), recordNumber)); + } + final byte[] bases = record.getReadBases(); + final byte[] secondaryBases = StringUtil.stringToBytes(e2); + for (int i = 0; i < Math.min(bases.length, secondaryBases.length); ++i) { + if (SequenceUtil.isNoCall(bases[i]) || SequenceUtil.isNoCall(secondaryBases[i])) { + continue; + } + if (SequenceUtil.basesEqual(bases[i], secondaryBases[i])) { + addError(new SAMValidationError(Type.E2_BASE_EQUALS_PRIMARY_BASE, + String.format("Secondary base call (%c) == primary base call (%c)", + (char) secondaryBases[i], (char) bases[i]), + record.getReadName(), recordNumber)); + break; + } + } + } + final String u2 = (String) record.getAttribute(SAMTag.U2); + if (u2 != null && u2.length() != record.getReadLength()) { + addError(new SAMValidationError(Type.MISMATCH_READ_LENGTH_AND_U2_LENGTH, + String.format("U2 tag length (%d) != read length (%d)", u2.length(), record.getReadLength()), + record.getReadName(), recordNumber)); + } + } + + private boolean validateCigar(final SAMRecord record, final long recordNumber) { + return record.getReadUnmappedFlag() || validateCigar(record, recordNumber, true); + } + + private boolean validateMateCigar(final SAMRecord record, final long recordNumber) { + return validateCigar(record, recordNumber, false); + } + + private boolean validateCigar(final SAMRecord record, final long recordNumber, final boolean isReadCigar) { + final ValidationStringency savedStringency = record.getValidationStringency(); + record.setValidationStringency(ValidationStringency.LENIENT); + final List errors = isReadCigar ? record.validateCigar(recordNumber) : SAMUtils.validateMateCigar(record, recordNumber); + record.setValidationStringency(savedStringency); + if (errors == null) { + return true; + } + boolean valid = true; + for (final SAMValidationError error : errors) { + addError(error); + valid = false; + } + return valid; + } + + private boolean validateSortOrder(final SAMRecord record, final long recordNumber) { + final SAMRecord prev = orderChecker.getPreviousRecord(); + boolean isValidSortOrder = orderChecker.isSorted(record); + if (!isValidSortOrder) { + addError(new SAMValidationError( + Type.RECORD_OUT_OF_ORDER, + String.format( + "The record is out of [%s] order, prior read name [%s], prior coordinates [%d:%d]", + record.getHeader().getSortOrder().name(), + prev.getReadName(), + prev.getReferenceIndex(), + prev.getAlignmentStart()), + record.getReadName(), + recordNumber)); + } + return isValidSortOrder; + } + + private void init(final ReferenceSequenceFile reference, final SAMFileHeader header) { + if (header.getSortOrder() == SAMFileHeader.SortOrder.coordinate) { + this.pairEndInfoByName = new CoordinateSortedPairEndInfoMap(); + } else { + this.pairEndInfoByName = new InMemoryPairEndInfoMap(); + } + if (reference != null) { + this.refFileWalker = new ReferenceSequenceFileWalker(reference); + this.samSequenceDictionary = reference.getSequenceDictionary(); + } + } + + private void cleanup() { + this.errorsByType = null; + this.pairEndInfoByName = null; + this.refFileWalker = null; + } + + private void validateNmTag(final SAMRecord record, final long recordNumber) { + if (!record.getReadUnmappedFlag()) { + final Integer tagNucleotideDiffs = record.getIntegerAttribute(ReservedTagConstants.NM); + if (tagNucleotideDiffs == null) { + addError(new SAMValidationError( + Type.MISSING_TAG_NM, + "NM tag (nucleotide differences) is missing", + record.getReadName(), + recordNumber)); + } else if (refFileWalker != null) { + final ReferenceSequence refSequence = refFileWalker.get(record.getReferenceIndex()); + final int actualNucleotideDiffs = SequenceUtil.calculateSamNmTag(record, refSequence.getBases(), + 0, isBisulfiteSequenced()); + + if (!tagNucleotideDiffs.equals(actualNucleotideDiffs)) { + addError(new SAMValidationError( + Type.INVALID_TAG_NM, + "NM tag (nucleotide differences) in file [" + tagNucleotideDiffs + + "] does not match reality [" + actualNucleotideDiffs + "]", + record.getReadName(), + recordNumber)); + } + } + } + } + + private void validateMateFields(final SAMRecord record, final long recordNumber) { + if (!record.getReadPairedFlag() || record.isSecondaryOrSupplementary()) { + return; + } + validateMateCigar(record, recordNumber); + + if (skipMateValidation) { + return; + } + + final PairEndInfo pairEndInfo = pairEndInfoByName.remove(record.getReferenceIndex(), record.getReadName()); + if (pairEndInfo == null) { + pairEndInfoByName.put(record.getMateReferenceIndex(), record.getReadName(), new PairEndInfo(record, recordNumber)); + } else { + final List errors = + pairEndInfo.validateMates(new PairEndInfo(record, recordNumber), record.getReadName()); + for (final SAMValidationError error : errors) { + addError(error); + } + } + } + + private void validateHeader(final SAMFileHeader fileHeader) { + for (final SAMValidationError error : fileHeader.getValidationErrors()) { + addError(error); + } + if (fileHeader.getVersion() == null) { + addError(new SAMValidationError(Type.MISSING_VERSION_NUMBER, "Header has no version number", null)); + } else if (!SAMFileHeader.ACCEPTABLE_VERSIONS.contains(fileHeader.getVersion())) { + addError(new SAMValidationError(Type.INVALID_VERSION_NUMBER, "Header version: " + + fileHeader.getVersion() + " does not match any of the acceptable versions: " + + StringUtil.join(", ", SAMFileHeader.ACCEPTABLE_VERSIONS.toArray(new String[0])), + null)); + } + if (fileHeader.getSequenceDictionary().isEmpty()) { + sequenceDictionaryEmptyAndNoWarningEmitted = true; + } else { + if (samSequenceDictionary != null) { + if (!fileHeader.getSequenceDictionary().isSameDictionary(samSequenceDictionary)) { + addError(new SAMValidationError(Type.MISMATCH_FILE_SEQ_DICT, "Mismatch between file and sequence dictionary", null)); + } + } + + final List longSeqs = fileHeader.getSequenceDictionary().getSequences().stream() + .filter(s -> s.getSequenceLength() > GenomicIndexUtil.BIN_GENOMIC_SPAN).collect(Collectors.toList()); + + if (!longSeqs.isEmpty()) { + final String msg = "Reference sequences are too long for BAI indexing: " + StringUtil.join(", ", longSeqs); + addError(new SAMValidationError(Type.REF_SEQ_TOO_LONG_FOR_BAI, msg, null)); + } + } + if (fileHeader.getReadGroups().isEmpty()) { + addError(new SAMValidationError(Type.MISSING_READ_GROUP, "Read groups is empty", null)); + } + final List pgs = fileHeader.getProgramRecords(); + for (int i = 0; i < pgs.size() - 1; i++) { + for (int j = i + 1; j < pgs.size(); j++) { + if (pgs.get(i).getProgramGroupId().equals(pgs.get(j).getProgramGroupId())) { + addError(new SAMValidationError(Type.DUPLICATE_PROGRAM_GROUP_ID, "Duplicate " + + "program group id: " + pgs.get(i).getProgramGroupId(), null)); + } + } + } + + final List rgs = fileHeader.getReadGroups(); + final Set readGroupIDs = new HashSet<>(); + + for (final SAMReadGroupRecord record : rgs) { + final String readGroupID = record.getReadGroupId(); + if (readGroupIDs.contains(readGroupID)) { + addError(new SAMValidationError(Type.DUPLICATE_READ_GROUP_ID, "Duplicate " + + "read group id: " + readGroupID, null)); + } else { + readGroupIDs.add(readGroupID); + } + + final String platformValue = record.getPlatform(); + if (platformValue == null || platformValue.isEmpty()) { + addError(new SAMValidationError(Type.MISSING_PLATFORM_VALUE, + "A platform (PL) attribute was not found for read group ", + readGroupID)); + } else { + // NB: cannot be null, so not catching a NPE + try { + SAMReadGroupRecord.PlatformValue.valueOf(platformValue.toUpperCase()); + } catch (IllegalArgumentException e) { + addError(new SAMValidationError(Type.INVALID_PLATFORM_VALUE, + "The platform (PL) attribute (" + platformValue + ") + was not one of the valid values for read group ", + readGroupID)); + } + } + } + } + + /** + * Number of warnings during SAM file validation + * + * @return number of warnings + */ + public int getNumWarnings() { + return this.numWarnings; + } + + /** + * Number of errors during SAM file validation + * + * @return number of errors + */ + public int getNumErrors() { + return this.numErrors; + } + + private void addError(final SAMValidationError error) { + // Just ignore an error if it's of a type we're not interested in + if (this.errorsToIgnore.contains(error.getType())) return; + + switch (error.getType().severity) { + case WARNING: + if (this.ignoreWarnings) { + return; + } + this.numWarnings++; + break; + case ERROR: + this.numErrors++; + break; + default: + throw new SAMException("Unknown SAM validation error severity: " + error.getType().severity); + } + + this.errorsByType.increment(error.getType()); + if (verbose) { + out.println(error); + out.flush(); + if (this.errorsByType.getCount() >= maxVerboseOutput) { + throw new MaxOutputExceededException(); + } + } + } + + /** + * Control verbosity + * + * @param verbose True in order to emit a message per error or warning. + * @param maxVerboseOutput If verbose, emit no more than this many messages. Ignored if !verbose. + */ + public void setVerbose(final boolean verbose, final int maxVerboseOutput) { + this.verbose = verbose; + this.maxVerboseOutput = maxVerboseOutput; + } + + public boolean isBisulfiteSequenced() { + return bisulfiteSequenced; + } + + public void setBisulfiteSequenced(boolean bisulfiteSequenced) { + this.bisulfiteSequenced = bisulfiteSequenced; + } + + /** + * @deprecated use {@link #setIndexValidationStringency} instead + */ + @Deprecated + public SamFileValidator setValidateIndex(final boolean validateIndex) { + // The SamReader must also have IndexCaching enabled to have the index validated, + return this.setIndexValidationStringency(validateIndex ? IndexValidationStringency.EXHAUSTIVE : IndexValidationStringency.NONE); + } + + public SamFileValidator setIndexValidationStringency(final IndexValidationStringency stringency) { + this.indexValidationStringency = stringency; + return this; + } + + public static class ValidationMetrics extends MetricBase { + } + + /** + * This class is used so we don't have to store the entire SAMRecord in memory while we wait + * to find a record's mate and also to store the record number. + */ + private static class PairEndInfo { + private final int readAlignmentStart; + private final int readReferenceIndex; + private final boolean readNegStrandFlag; + private final boolean readUnmappedFlag; + private final String readCigarString; + + private final int mateAlignmentStart; + private final int mateReferenceIndex; + private final boolean mateNegStrandFlag; + private final boolean mateUnmappedFlag; + private final String mateCigarString; + + private final boolean firstOfPairFlag; + + private final long recordNumber; + + public PairEndInfo(final SAMRecord record, final long recordNumber) { + this.recordNumber = recordNumber; + + this.readAlignmentStart = record.getAlignmentStart(); + this.readNegStrandFlag = record.getReadNegativeStrandFlag(); + this.readReferenceIndex = record.getReferenceIndex(); + this.readUnmappedFlag = record.getReadUnmappedFlag(); + this.readCigarString = record.getCigarString(); + + this.mateAlignmentStart = record.getMateAlignmentStart(); + this.mateNegStrandFlag = record.getMateNegativeStrandFlag(); + this.mateReferenceIndex = record.getMateReferenceIndex(); + this.mateUnmappedFlag = record.getMateUnmappedFlag(); + final Object mcs = record.getAttribute(SAMTag.MC); + this.mateCigarString = (mcs != null) ? (String) mcs : null; + + this.firstOfPairFlag = record.getFirstOfPairFlag(); + } + + private PairEndInfo(final int readAlignmentStart, final int readReferenceIndex, final boolean readNegStrandFlag, final boolean readUnmappedFlag, + final String readCigarString, + final int mateAlignmentStart, final int mateReferenceIndex, final boolean mateNegStrandFlag, final boolean mateUnmappedFlag, + final String mateCigarString, final boolean firstOfPairFlag, final long recordNumber) { + this.readAlignmentStart = readAlignmentStart; + this.readReferenceIndex = readReferenceIndex; + this.readNegStrandFlag = readNegStrandFlag; + this.readUnmappedFlag = readUnmappedFlag; + this.readCigarString = readCigarString; + this.mateAlignmentStart = mateAlignmentStart; + this.mateReferenceIndex = mateReferenceIndex; + this.mateNegStrandFlag = mateNegStrandFlag; + this.mateUnmappedFlag = mateUnmappedFlag; + this.mateCigarString = mateCigarString; + this.firstOfPairFlag = firstOfPairFlag; + this.recordNumber = recordNumber; + } + + public List validateMates(final PairEndInfo mate, final String readName) { + final List errors = new ArrayList<>(); + validateMateFields(this, mate, readName, errors); + validateMateFields(mate, this, readName, errors); + // Validations that should not be repeated on both ends + if (this.firstOfPairFlag == mate.firstOfPairFlag) { + final String whichEnd = this.firstOfPairFlag ? "first" : "second"; + errors.add(new SAMValidationError( + Type.MATES_ARE_SAME_END, + "Both mates are marked as " + whichEnd + " of pair", + readName, + this.recordNumber + )); + } + return errors; + } + + private void validateMateFields(final PairEndInfo end1, final PairEndInfo end2, final String readName, final List errors) { + if (end1.mateAlignmentStart != end2.readAlignmentStart) { + errors.add(new SAMValidationError( + Type.MISMATCH_MATE_ALIGNMENT_START, + "Mate alignment does not match alignment start of mate", + readName, + end1.recordNumber)); + } + if (end1.mateNegStrandFlag != end2.readNegStrandFlag) { + errors.add(new SAMValidationError( + Type.MISMATCH_FLAG_MATE_NEG_STRAND, + "Mate negative strand flag does not match read negative strand flag of mate", + readName, + end1.recordNumber)); + } + if (end1.mateReferenceIndex != end2.readReferenceIndex) { + errors.add(new SAMValidationError( + Type.MISMATCH_MATE_REF_INDEX, + "Mate reference index (MRNM) does not match reference index of mate", + readName, + end1.recordNumber)); + } + if (end1.mateUnmappedFlag != end2.readUnmappedFlag) { + errors.add(new SAMValidationError( + Type.MISMATCH_FLAG_MATE_UNMAPPED, + "Mate unmapped flag does not match read unmapped flag of mate", + readName, + end1.recordNumber)); + } + if ((end1.mateCigarString != null) && (!end1.mateCigarString.equals(end2.readCigarString))) { + errors.add(new SAMValidationError( + Type.MISMATCH_MATE_CIGAR_STRING, + "Mate CIGAR string does not match CIGAR string of mate", + readName, + end1.recordNumber)); + } + // Note - don't need to validate that the mateCigarString is a valid cigar string, since this + // will be validated by validateCigar on the mate's record itself. + } + } + + /** + * Thrown in addError indicating that maxVerboseOutput has been exceeded and processing should stop + */ + private static class MaxOutputExceededException extends SAMException { + MaxOutputExceededException() { + super("maxVerboseOutput exceeded."); + } + } + + interface PairEndInfoMap extends Iterable> { + void put(int mateReferenceIndex, String key, PairEndInfo value); + + PairEndInfo remove(int mateReferenceIndex, String key); + + @Override + CloseableIterator> iterator(); + } + + private class CoordinateSortedPairEndInfoMap implements PairEndInfoMap { + private final CoordinateSortedPairInfoMap onDiskMap = + new CoordinateSortedPairInfoMap<>(maxTempFiles, new Codec()); + + @Override + public void put(int mateReferenceIndex, String key, PairEndInfo value) { + onDiskMap.put(mateReferenceIndex, key, value); + } + + @Override + public PairEndInfo remove(int mateReferenceIndex, String key) { + return onDiskMap.remove(mateReferenceIndex, key); + } + + @Override + public CloseableIterator> iterator() { + return onDiskMap.iterator(); + } + + private static class Codec implements CoordinateSortedPairInfoMap.Codec { + private DataInputStream in; + private DataOutputStream out; + + @Override + public void setOutputStream(final OutputStream os) { + this.out = new DataOutputStream(os); + } + + @Override + public void setInputStream(final InputStream is) { + this.in = new DataInputStream(is); + } + + @Override + public void encode(final String key, final PairEndInfo record) { + try { + out.writeUTF(key); + out.writeInt(record.readAlignmentStart); + out.writeInt(record.readReferenceIndex); + out.writeBoolean(record.readNegStrandFlag); + out.writeBoolean(record.readUnmappedFlag); + out.writeUTF(record.readCigarString); + out.writeInt(record.mateAlignmentStart); + out.writeInt(record.mateReferenceIndex); + out.writeBoolean(record.mateNegStrandFlag); + out.writeBoolean(record.mateUnmappedFlag); + // writeUTF can't take null, so store a null mateCigarString as an empty string + out.writeUTF(record.mateCigarString != null ? record.mateCigarString : ""); + out.writeBoolean(record.firstOfPairFlag); + out.writeLong(record.recordNumber); + } catch (IOException e) { + throw new SAMException("Error spilling PairInfo to disk", e); + } + } + + @Override + public Map.Entry decode() { + try { + final String key = in.readUTF(); + final int readAlignmentStart = in.readInt(); + final int readReferenceIndex = in.readInt(); + final boolean readNegStrandFlag = in.readBoolean(); + final boolean readUnmappedFlag = in.readBoolean(); + final String readCigarString = in.readUTF(); + + final int mateAlignmentStart = in.readInt(); + final int mateReferenceIndex = in.readInt(); + final boolean mateNegStrandFlag = in.readBoolean(); + final boolean mateUnmappedFlag = in.readBoolean(); + + // read mateCigarString - note that null value is stored as an empty string + final String mcs = in.readUTF(); + final String mateCigarString = !mcs.isEmpty() ? mcs : null; + + final boolean firstOfPairFlag = in.readBoolean(); + + final long recordNumber = in.readLong(); + final PairEndInfo rec = new PairEndInfo(readAlignmentStart, readReferenceIndex, readNegStrandFlag, + readUnmappedFlag, readCigarString, mateAlignmentStart, mateReferenceIndex, mateNegStrandFlag, + mateUnmappedFlag, mateCigarString, + firstOfPairFlag, recordNumber); + return new AbstractMap.SimpleEntry<>(key, rec); + } catch (IOException e) { + throw new SAMException("Error reading PairInfo from disk", e); + } + } + } + } + + private static class InMemoryPairEndInfoMap implements PairEndInfoMap { + private final Map map = new HashMap<>(); + + @Override + public void put(int mateReferenceIndex, String key, PairEndInfo value) { + if (mateReferenceIndex != value.mateReferenceIndex) + throw new IllegalArgumentException("mateReferenceIndex does not agree with PairEndInfo"); + map.put(key, value); + } + + @Override + public PairEndInfo remove(int mateReferenceIndex, String key) { + return map.remove(key); + } + + @Override + public CloseableIterator> iterator() { + final Iterator> it = map.entrySet().iterator(); + return new CloseableIterator<>() { + @Override + public void close() { + // do nothing + } + + @Override + public boolean hasNext() { + return it.hasNext(); + } + + @Override + public Map.Entry next() { + return it.next(); + } + + @Override + public void remove() { + it.remove(); + } + }; + } + } +} \ No newline at end of file diff --git a/qmule/src/org/qcmg/qmule/ValidateSamFile.java b/qmule/src/org/qcmg/qmule/ValidateSamFile.java new file mode 100644 index 000000000..ac05505fb --- /dev/null +++ b/qmule/src/org/qcmg/qmule/ValidateSamFile.java @@ -0,0 +1,310 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package org.qcmg.qmule; + +import htsjdk.samtools.*; +import htsjdk.samtools.BamIndexValidator.IndexValidationStringency; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileFactory; +import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.Log; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import picard.PicardException; +import picard.cmdline.CommandLineProgram; +import picard.cmdline.StandardOptionDefinitions; +import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup; +import picard.nio.PicardHtsPath; + +import java.io.FileNotFoundException; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.List; + +/** + *

This tool reports on the validity of a SAM or BAM file relative to the SAM format + * specification. This is useful for troubleshooting errors encountered with other tools that may be caused by improper + * formatting, faulty alignments, incorrect flag values, etc.

+ * + *

By default, the tool runs in VERBOSE mode and will exit after finding 100 errors and output them to the console (stdout). + * Therefore, it is often more practical to run this tool initially using the MODE=SUMMARY option. This mode outputs a summary + * table listing the numbers of all 'errors' and 'warnings'.

+ * + *

When fixing errors in your file, it is often useful to prioritize the severe validation errors and ignore the + * errors/warnings of lesser concern. This can be done using the IGNORE and/or IGNORE_WARNINGS arguments. For helpful + * suggestions on error prioritization, please follow this link to obtain additional documentation on ValidateSamFile.

+ * + *

After identifying and fixing your 'warnings/errors', we recommend that you rerun this tool to validate your SAM/BAM + * file prior to proceeding with your downstream analysis. This will verify that all problems in your file have been addressed.

+ * + * This tool is a wrapper for {@link SamFileValidator}. + * + *

Usage example:

+ *
+ * java -jar picard.jar ValidateSamFile \
+ * I=input.bam \
+ * MODE=SUMMARY + *
+ *

To obtain a complete list with descriptions of both 'ERROR' and 'WARNING' messages, please see our additional + * documentation for this tool.

+ *
+ * + * @author Doug Voet + */ +@CommandLineProgramProperties( + summary = ValidateSamFile.USAGE_SUMMARY + ValidateSamFile.USAGE_DETAILS, + oneLineSummary = ValidateSamFile.USAGE_SUMMARY, + programGroup = DiagnosticsAndQCProgramGroup.class) +@DocumentedFeature +public class ValidateSamFile extends CommandLineProgram { + + public static void main(final String[] argv) { + + + int exitStatus = new ValidateSamFile().instanceMain(argv); + + System.exit(exitStatus); + } + private static final Log log = Log.getInstance(ValidateSamFile.class); + static final String USAGE_SUMMARY = "Validates a SAM/BAM/CRAM file."; + static final String USAGE_DETAILS = "

This tool reports on the validity of a SAM/BAM/CRAM file relative to the SAM format " + + "specification. This is useful for troubleshooting errors encountered with other tools that may be caused by improper " + + "formatting, faulty alignments, incorrect flag values, etc.

" + + + "

By default, the tool runs in VERBOSE mode and will exit after finding 100 errors and output them to the console (stdout). " + + "Therefore, it is often more practical to run this tool initially using the MODE=SUMMARY option. This mode outputs a summary " + + "table listing the numbers of all 'errors' and 'warnings'.

"+ + + "

When fixing errors in your file, it is often useful to prioritize the severe validation errors and ignore the " + + "errors/warnings of lesser concern. This can be done using the IGNORE and/or IGNORE_WARNINGS arguments. For helpful " + + "suggestions on error prioritization, please follow this link to obtain additional documentation on ValidateSamFile.

" + + + "

After identifying and fixing your 'warnings/errors', we recommend that you rerun this tool to validate your SAM/BAM " + + "file prior to proceeding with your downstream analysis. This will verify that all problems in your file have been addressed.

" + + "

Usage example:

" + + "
" +
+            "java -jar picard.jar ValidateSamFile \\
" + + " I=input.bam \\
" + + " MODE=SUMMARY" + + "
" + + "

To obtain a complete list with descriptions of both 'ERROR' and 'WARNING' messages, please see our additional " + + "documentation for this tool.

" + + ""+ + "
"+ + "Return codes depend on the errors/warnings discovered:" + + "

"+ + "-1 failed to complete execution\n" + + "0 ran successfully\n" + + "1 warnings but no errors\n" + + "2 errors and warnings\n" + + "3 errors but no warnings"; + + + public enum Mode {VERBOSE, SUMMARY} + + @Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, + doc = "Input SAM/BAM/CRAM file") + public PicardHtsPath INPUT; + + @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, + doc = "Output file or standard out if missing", + optional = true) + public PicardHtsPath OUTPUT; + + @Argument(shortName = "M", + doc = "Mode of output") + public Mode MODE = Mode.VERBOSE; + + @Argument(doc = "List of validation error types to ignore.", optional = true) + public List IGNORE = new ArrayList<>(); + + @Argument(shortName = "MO", + doc = "The maximum number of lines output in verbose mode") + public Integer MAX_OUTPUT = 100; + + @Argument(doc = "If true, only report errors and ignore warnings.") + public boolean IGNORE_WARNINGS = false; + + @Argument(doc = "DEPRECATED. Use INDEX_VALIDATION_STRINGENCY instead. If true and input is " + + "a BAM file with an index file, also validates the index. Until this parameter is retired " + + "VALIDATE INDEX and INDEX_VALIDATION_STRINGENCY must agree on whether to validate the index.") + public boolean VALIDATE_INDEX = true; + + @Argument(doc = "If set to anything other than IndexValidationStringency.NONE and input is " + + "a BAM file with an index file, also validates the index at the specified stringency. " + + "Until VALIDATE_INDEX is retired, VALIDATE INDEX and INDEX_VALIDATION_STRINGENCY " + + "must agree on whether to validate the index.") + public IndexValidationStringency INDEX_VALIDATION_STRINGENCY = IndexValidationStringency.EXHAUSTIVE; + + @Argument(shortName = "BISULFITE", + doc = "Whether the input file consists of bisulfite sequenced reads. " + + "If so, C->T is not counted as an error in computing the value of the NM tag.") + public boolean IS_BISULFITE_SEQUENCED = false; + + @Argument(doc = "Relevant for a coordinate-sorted file containing read pairs only. " + + "Maximum number of file handles to keep open when spilling mate info to disk. " + + "Set this number a little lower than the per-process maximum number of file that may be open. " + + "This number can be found by executing the 'ulimit -n' command on a Unix system.") + public int MAX_OPEN_TEMP_FILES = 8000; + + @Argument(shortName = "SMV", + doc = "If true, this tool will not attempt to validate mate information. " + + "In general cases, this option should not be used. However, in cases where samples have very " + + "high duplication or chimerism rates (> 10%), the mate validation process often requires extremely " + + "large amounts of memory to run, so this flag allows you to forego that check.") + public boolean SKIP_MATE_VALIDATION = false; + + /** + * Return types for doWork() + */ + enum ReturnTypes { + FAILED(-1), // failed to complete execution + SUCCESSFUL(0), // ran successfully + WARNINGS(1), // warnings but no errors + ERRORS_WARNINGS(2), // errors and warnings + ERRORS(3); // errors but no warnings + + private final int value; + + ReturnTypes(final int value) { + this.value = value; + } + + int value() { + return value; + } + } + + /** + * Do the work after command line has been parsed. + * + * @return -1 - failed to complete execution, 0 - ran successfully without warnings or errors, 1 - warnings but no errors, + * 2 - errors and warnings, 3 - errors but no warnings + * + */ + @Override + protected int doWork() { + try { + IOUtil.assertFileIsReadable(INPUT.toPath()); + ReferenceSequenceFile reference = null; + if (REFERENCE_SEQUENCE != null) { + IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); + reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE); + + } else { + log.warn("NM validation cannot be performed without the reference. All other validations will still occur."); + } + final PrintWriter out; + if (OUTPUT != null) { + try { + out = new PrintWriter(Files.newOutputStream(OUTPUT.toPath())); + } catch (FileNotFoundException e) { + // we already asserted this so we should not get here + throw new PicardException("Unexpected exception", e); + } + } else { + out = new PrintWriter(System.out); + } + + boolean result; + + final SamReaderFactory factory = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE) + .validationStringency(ValidationStringency.SILENT) + .enable(SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS); + final SamReader samReader = factory.open(INPUT.toPath()); + + if (samReader.type() != SamReader.Type.BAM_TYPE && samReader.type() != SamReader.Type.BAM_CSI_TYPE) VALIDATE_INDEX = false; + + factory.setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, VALIDATE_INDEX); + factory.reapplyOptions(samReader); + + final SamFileValidator validator = getSamFileValidator(out); + if (IOUtil.isRegularPath(INPUT.toPath())) { + // Do not check termination if reading from a stream + validator.validateBamFileTermination(INPUT.toPath()); + } + + result = switch (MODE) { + case SUMMARY -> validator.validateSamFileSummary(samReader, reference, INPUT.toPath(), REFERENCE_SEQUENCE); + case VERBOSE -> validator.validateSamFileVerbose(samReader, reference, INPUT.toPath(), REFERENCE_SEQUENCE); + }; + out.flush(); + + if (result) { + return ReturnTypes.SUCCESSFUL.value(); // ran successfully with no warnings or errors + } else { + if (validator.getNumErrors() == 0) { + if (validator.getNumWarnings() > 0) { + return ReturnTypes.WARNINGS.value(); // warnings but no errors + } else { + log.error("SAM file validation fails without warnings or errors."); + return ReturnTypes.FAILED.value(); + } + } else { + if (validator.getNumWarnings() > 0) { + return ReturnTypes.ERRORS_WARNINGS.value(); // errors and warnings + } else { + return ReturnTypes.ERRORS.value(); // errors but no warnings + } + } + } + } catch (Exception e) { + log.error(e.getMessage()); + return ReturnTypes.FAILED.value(); // failed to complete execution + } + } + + private SamFileValidator getSamFileValidator(PrintWriter out) { + final SamFileValidator validator = new SamFileValidator(out, MAX_OPEN_TEMP_FILES); + validator.setErrorsToIgnore(IGNORE); + validator.setSkipMateValidation(SKIP_MATE_VALIDATION); + validator.setBisulfiteSequenced(IS_BISULFITE_SEQUENCED); + validator.setIgnoreWarnings(IGNORE_WARNINGS); + + if (MODE == Mode.SUMMARY) { + validator.setVerbose(false, 0); + } else { + validator.setVerbose(true, MAX_OUTPUT); + } + if (VALIDATE_INDEX) { + validator.setIndexValidationStringency(IndexValidationStringency.EXHAUSTIVE); + } + return validator; + } + + @Override + protected String[] customCommandLineValidation() { + if ((!VALIDATE_INDEX && INDEX_VALIDATION_STRINGENCY != IndexValidationStringency.NONE) || + (VALIDATE_INDEX && INDEX_VALIDATION_STRINGENCY == IndexValidationStringency.NONE)) { + return new String[]{"VALIDATE_INDEX and INDEX_VALIDATION_STRINGENCY must be consistent: " + + "VALIDATE_INDEX is " + VALIDATE_INDEX + " and INDEX_VALIDATION_STRINGENCY is " + + INDEX_VALIDATION_STRINGENCY}; + } + + return super.customCommandLineValidation(); + } +} \ No newline at end of file diff --git a/qpicard/build.gradle b/qpicard/build.gradle index a5a68ee69..8181c1f33 100644 --- a/qpicard/build.gradle +++ b/qpicard/build.gradle @@ -1,9 +1,18 @@ def isExecutable = false apply plugin: 'java-library' +repositories { + mavenCentral() + flatDir { + dirs '../lib' + } +} + dependencies { implementation project(':qcommon') implementation project(':qio') - api 'com.github.samtools:htsjdk:4.0.2' + //api 'com.github.samtools:htsjdk:4.0.2' + // Reference the JAR file from the local repository + api files('../lib/htsjdk-V1.0.0-fork.jar') }