diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala index 59319c56d4..1f9f2d95de 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala @@ -21,8 +21,6 @@ import java.util.logging.Level._ import org.apache.spark.Logging import org.bdgenomics.adam.util.ParquetLogger import org.bdgenomics.utils.cli._ -import scala.Some -import scala.collection.mutable.ListBuffer object ADAMMain extends Logging { diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala index caf33b0e66..a56eb44e2c 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountReadKmers.scala @@ -44,8 +44,6 @@ class CountReadKmersArgs extends Args4jBase with ParquetArgs { var outputPath: String = null @Argument(required = true, metaVar = "KMER_LENGTH", usage = "Length of k-mers", index = 2) var kmerLength: Int = 0 - @Args4jOption(required = false, name = "-countQmers", usage = "Counts q-mers instead of k-mers.") - var countQmers: Boolean = false @Args4jOption(required = false, name = "-printHistogram", usage = "Prints a histogram of counts.") var printHistogram: Boolean = false @Args4jOption(required = false, name = "-repartition", usage = "Set the number of partitions to map data to") @@ -71,11 +69,7 @@ class CountReadKmers(protected val args: CountReadKmersArgs) extends BDGSparkCom } // count kmers - val countedKmers = if (args.countQmers) { - adamRecords.adamCountQmers(args.kmerLength) - } else { - adamRecords.adamCountKmers(args.kmerLength).map(kv => (kv._1, kv._2.toDouble)) - } + val countedKmers = adamRecords.adamCountKmers(args.kmerLength) // cache counted kmers countedKmers.cache() @@ -90,11 +84,7 @@ class CountReadKmers(protected val args: 
CountReadKmersArgs) extends BDGSparkCom } // save as text file - if (args.countQmers) { - countedKmers.map(kv => kv._1 + ", " + kv._2) - } else { - countedKmers.map(kv => kv._1 + ", " + kv._2.toLong) - }.saveAsTextFile(args.outputPath) + countedKmers.saveAsTextFile(args.outputPath) } } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala index 5f23de0b8c..b2efff2467 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala @@ -66,20 +66,6 @@ class TransformArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs { var lodThreshold = 5.0 @Args4jOption(required = false, name = "-max_target_size", usage = "The maximum length of a target region to attempt realigning. Default length is 3000.") var maxTargetSize = 3000 - @Args4jOption(required = false, name = "-trimReads", usage = "Apply a fixed trim to the prefix and suffix of all reads/reads in a specific read group.") - var trimReads: Boolean = false - @Args4jOption(required = false, name = "-trimFromStart", usage = "Trim to be applied to start of read.") - var trimStart: Int = 0 - @Args4jOption(required = false, name = "-trimFromEnd", usage = "Trim to be applied to end of read.") - var trimEnd: Int = 0 - @Args4jOption(required = false, name = "-trimReadGroup", usage = "Read group to be trimmed. If omitted, all reads are trimmed.") - var trimReadGroup: String = null - @Args4jOption(required = false, name = "-qualityBasedTrim", usage = "Trims reads based on quality scores of prefix/suffixes across read group.") - var qualityBasedTrim: Boolean = false - @Args4jOption(required = false, name = "-qualityThreshold", usage = "Phred scaled quality threshold used for trimming. 
If omitted, Phred 20 is used.") - var qualityThreshold: Int = 20 - @Args4jOption(required = false, name = "-trimBeforeBQSR", usage = "Performs quality based trim before running BQSR. Default is to run quality based trim after BQSR.") - var trimBeforeBQSR: Boolean = false @Args4jOption(required = false, name = "-repartition", usage = "Set the number of partitions to map data to") var repartition: Int = -1 @Args4jOption(required = false, name = "-coalesce", usage = "Set the number of partitions written to the ADAM output directory") @@ -109,16 +95,6 @@ class Transform(protected val args: TransformArgs) extends BDGSparkCommand[Trans adamRecords = adamRecords.repartition(args.repartition) } - if (args.trimReads) { - log.info("Trimming reads.") - adamRecords = adamRecords.adamTrimReads(args.trimStart, args.trimEnd, args.trimReadGroup) - } - - if (args.qualityBasedTrim && args.trimBeforeBQSR) { - log.info("Applying quality based trim.") - adamRecords = adamRecords.adamTrimLowQualityReadGroups(args.qualityThreshold) - } - if (args.markDuplicates) { log.info("Marking duplicates") adamRecords = adamRecords.adamMarkDuplicates() @@ -144,11 +120,6 @@ class Transform(protected val args: TransformArgs) extends BDGSparkCommand[Trans adamRecords = adamRecords.adamBQSR(sc.broadcast(knownSnps), Option(args.observationsPath)) } - if (args.qualityBasedTrim && !args.trimBeforeBQSR) { - log.info("Applying quality based trim.") - adamRecords = adamRecords.adamTrimLowQualityReadGroups(args.qualityThreshold) - } - if (args.coalesce != -1) { log.info("Coalescing the number of partitions to '%d'".format(args.coalesce)) adamRecords = adamRecords.coalesce(args.coalesce, shuffle = true) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/prefixtrie/DNAPrefixTrie.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/prefixtrie/DNAPrefixTrie.scala deleted file mode 100644 index 256cf613bb..0000000000 --- 
a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/prefixtrie/DNAPrefixTrie.scala +++ /dev/null @@ -1,412 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.algorithms.prefixtrie - -import scala.collection.TraversableOnce - -object DNAPrefixTrie { - - /** - * Creates a fixed depth prefix trie, and populates it with a set of - * key->value mappings. Keys containing ambiguous bases are not added to the - * collection. - * - * @param init A map specifying key->value mappings. Used for populating the - * initial trie. - * @tparam V The type of the values stored in the trie. - * @return Returns a populated trie. - * - * @throws AssertionError An assertion error is thrown if the length of the - * keys provided varies or if the input is an empty map. 
- */ - def apply[V](init: Map[String, V]): DNAPrefixTrie[V] = { - assert(init.size > 0, "Cannot build empty prefix trie.") - - // keys must all be the same length - val len = init.head._1.length - assert(init.forall(kv => kv._1.length == len)) - - // if key length is greater than 1, start nesting - if (len > 1) { - new NestedFixedDepthDNAPrefixTrie(init) - } else { - new FixedDepthDNAPrefixTrieValues(init) - } - } -} - -/** - * This trait represents a prefix trie that stores a mapping between string keys representing - * DNA sequences, and values of any type. Classes that implement this trait are assumed to be - * immutable. For DNA sequences, we allow the storage of unambiguous bases ("ACTG"). Search - * methods accept the unambiguous bases and wildcards ("N" or "*"). - * - * @tparam V The type of the values stored in the trie. - */ -trait DNAPrefixTrie[V] extends Serializable { - - /** - * Checks whether this tree contains a specific key. The contains method accepts wildcards - * (either '*' or 'N') for doing a wildcard content check. - * - * @param key Key to check for the existance of. - * @return Returns true if the key is found. - */ - def contains(key: String): Boolean - - /** - * Gets the value of a key that is contained in the tree. This method throws an exception - * if the key is not found in the tree. This method and all other getters do not take wildcards. - * - * @param key Key to search for the value of. - * @tparam V Type of the values in this collection. - * @return Returns the value corresponding to this key. - * - * @throws IllegalArgumentException Throws an exception if the key is not in the collection. - * - * @see getOrElse - * @see getIfExists - * @see search - */ - def get(key: String): V - - /** - * Gets the value of a key that is contained in the tree. If this key is not found, a user-provided - * default value is returned. This method and all other getters do not take wildcards. - * - * @param key Key to search for the value of. 
- * @param default Default value to return if the key is not found. - * @tparam V Type of the values in this collection. - * @return Returns the value corresponding to this key, or a default value if the key is not found. - * - * @see get - * @see getIfExists - * @see search - */ - def getOrElse(key: String, default: V): V - - /** - * Gets the value of a key that is contained in the tree, wrapped as an option. None is returned - * if the key is not found. This method and all other getters do not take wildcards. - * - * @param key Key to search for the value of. - * @tparam V Type of the values in this collection. - * @return Returns the value wrapped in an option. None is returned if the key is not found. - * - * @see get - * @see getOrElse - * @see search - */ - def getIfExists(key: String): Option[V] - - /** - * Searches for all possible values that match a key, including wildcards. Returns the values - * and keys that match the wildcard in a map. - * - * @param key Key to search for, with wildcards. - * @tparam V Type of the values in this collection. - * @return Returns a map containing all keys that match this search key, and their values. - * - * @see prefixSearch - * @see suffixSearch - */ - def search(key: String): Map[String, V] - - /** - * Searches for all keys that match a given prefix. The prefix can include wildcards. The - * behavior of this method is identical to the other search method. - * - * @param key Key to search for, with wildcards. - * @tparam V Type of the values in this collection. - * @return Returns a map containing all keys that match this search key, and their values. - * - * @see search - * @see suffixSearch - */ - def prefixSearch(key: String): Map[String, V] - - /** - * Searches for all keys that match a given suffix. The suffix can include wildcards. The - * behavior of this method is identical to the other search method. - * - * @param key Key to search for, with wildcards. - * @tparam V Type of the values in this collection. 
- * @return Returns a map containing all keys that match this search key, and their values. - * - * @see search - * @see prefixSearch - */ - def suffixSearch(key: String): Map[String, V] - - /** - * Find provides identical functionality to search, except does not return a map. This - * enables an optimization for recursive search, where the iterator is converted back to a map - * at the last stage of the search. We do not expose this to users, since this method just - * enables a performance optimization inside of the implementation. - * - * @note This method does not check for correct key length before being called, as it is a - * package private method that should only be called after key length error checking. - * - * @param key Key to search for, with wildcards. - * @tparam V Type of the values in this collection. - * @return Returns a traversable collection containing tuples of all keys that match this search - * key, and their values. - */ - private[prefixtrie] def find(key: String): TraversableOnce[(String, V)] - - /** - * Returns the number of key/value pairs stored in this trie. - * - * @return The count of key/value pairs in this trie. - */ - def size: Int -} - -/** - * This class implements the non-terminal layers of our fixed-depth prefix trie, through a - * recursive structure. Scaladoc is only included for the methods unique to this class, - * otherwise see docs for the DNAPrefixTrie trait. - * - * @tparam V Type of the values stored by this class. 
- * - * @see DNAPrefixTrie - */ -private[prefixtrie] class NestedFixedDepthDNAPrefixTrie[V](init: Map[String, V]) extends DNAPrefixTrie[V] { - - // the length of keys stored at this level of the nested tree - val len = init.head._1.length - - // fixed length prefix map - val prefixMap: Array[Option[DNAPrefixTrie[V]]] = new Array(4) - - // create tree - val tmp = init.toArray.map(kv => (cToI(kv._1.head), (kv._1.drop(1), kv._2))) - - // loop over all possible entry values - (0 to 3).foreach(i => { - // collect all values that map here - val iVals = tmp.filter(t => t._1 == i).map(t => t._2).toMap - - // if we have elements in - prefixMap(i) = if (iVals.size > 0) { - Some(DNAPrefixTrie(iVals)) - } else { - None - } - }) - - /** - * Converts internal character codes into array indices. - * - * @param c Character to map to indices. - * @return Array indices. - */ - protected def cToI(c: Char): Int = c match { - case 'A' => 0 - case 'C' => 1 - case 'G' => 2 - case 'T' => 3 - case '*' | 'N' => -1 - case _ => throw new IllegalArgumentException("Element " + c + " not an allowable character.") - } - - /** - * Converts internal array indices back to character codes. - * - * @param i Index to map. - * @return Character code for this index. - */ - protected def iToC(i: Int): Char = i match { - case 0 => 'A' - case 1 => 'C' - case 2 => 'G' - case 3 => 'T' - case _ => throw new IllegalArgumentException("Index " + i + " not an allowable index.") - } - - def contains(key: String): Boolean = { - assert(key.length == len, - "Provided key has incorrect length: " + key.length + ", expected " + len) - - // get the mapping bucket - val bucket = cToI(key.head) - val next = key.drop(1) - - // do we need to search all buckets? 
if we do, search, else go to the bucket - if (bucket == -1) { - prefixMap.flatMap(v => v).map(_.contains(next)).fold(false)(_ || _) - } else { - prefixMap(bucket).fold(false)(_.contains(next)) - } - } - - def getIfExists(key: String): Option[V] = { - assert(key.length == len, - "Provided key has incorrect length: " + key.length + ", expected " + len) - assert(key.head != '*', - "Cannot perform wildcard search for value (key = " + key + ").") - - // get the mapping bucket - val bucket = cToI(key.head) - val next = key.drop(1) - - // map bucket - prefixMap(bucket).fold(None.asInstanceOf[Option[V]])(_.getIfExists(next)) - } - - def getOrElse(key: String, default: V): V = { - assert(key.length == len, - "Provided key has incorrect length: " + key.length + ", expected " + len) - assert(key.head != '*', - "Cannot perform wildcard search for value (key = " + key + ").") - - // get the mapping bucket - val bucket = cToI(key.head) - val next = key.drop(1) - - // fold over bucket - prefixMap(bucket).fold(default)(_.getOrElse(next, default)) - } - - def get(key: String): V = { - val rv = getIfExists(key) - - // check for defined return value - if (rv.isDefined) { - rv.get - } else { - throw new IllegalArgumentException("Key " + key + " is not defined.") - } - } - - private[prefixtrie] def find(key: String): TraversableOnce[(String, V)] = { - // get the mapping bucket - val bucket = cToI(key.head) - val next = key.drop(1) - - // do we need to search all buckets? 
if we do, don't filter anything, else only filter on key - (0 to 3).filter(i => bucket == -1 || bucket == i) - .flatMap(i => { - // get prepension character - val c = iToC(i) - - // fold to search, and map to prepend - prefixMap(i).fold(Iterator[(String, V)]() - .asInstanceOf[TraversableOnce[(String, V)]])(_.find(next)) - .map(kv => (c + kv._1, kv._2)) - }) - } - - def search(key: String): Map[String, V] = { - assert(key.length == len, - "Provided key has incorrect length: " + key.length + ", expected " + len) - - // call to internal find method - find(key).toMap - } - - def suffixSearch(key: String): Map[String, V] = { - assert(key.length <= len, - "Key (" + key + ") must be less than " + len + " characters long.") - - // append a prefix of wildcards and search - search(("*" * (len - key.length)) + key) - } - - def prefixSearch(key: String): Map[String, V] = { - assert(key.length <= len, - "Key (" + key + ") must be less than " + len + " characters long.") - - // prepend a suffix of wildcards and search - search(key + ("*" * (len - key.length))) - } - - def size: Int = prefixMap.map(_.fold(0)(_.size)).fold(0)(_ + _) -} - -/** - * This class comprises the "lowest" level of our fixed depth trie, and therefore contains the - * values stored in the trie. Scaladoc is only included for the methods unique to this class, - * otherwise see docs for the DNAPrefixTrie trait. - * - * @tparam V Type of the values stored by this class. 
- * - * @see DNAPrefixTrie - */ -private[prefixtrie] class FixedDepthDNAPrefixTrieValues[V](init: Map[String, V]) extends DNAPrefixTrie[V] { - - // filter out ambiguous bases - protected val values = init.filter(kv => (kv._1 == "A" || - kv._1 == "C" || - kv._1 == "G" || - kv._1 == "T")) - - def contains(key: String): Boolean = values.contains(key) - - def get(key: String): V = values(key) - - def getOrElse(key: String, default: V): V = values.getOrElse(key, default) - - def getIfExists(key: String): Option[V] = { - if (values.contains(key)) { - Some(values(key)) - } else { - None - } - } - - private[prefixtrie] def find(key: String): TraversableOnce[(String, V)] = { - if (key == "*" || key == "N") { - values - } else { - values.filterKeys(_ == key) - }.toIterator - } - - def search(key: String): Map[String, V] = { - assert(key.length == 1, - "Key (" + key + ") must have length 1.") - - find(key).toMap - } - - /** - * Performs a {pre,suf}-fix search on this trie. Since this level of the tree only stores - * keys of length 1, the two operations are identical. If keys of length 0 are provided, - * we perform a wildcard search. - * - * @param key Key to search for. - * @tparam V Type of objects in this tree. - * @return Returns the result of searching for this key at this level of the tree. 
- */ - private def fixSearch(key: String): Map[String, V] = { - if (key.length == 1) { - find(key).toMap - } else if (key.length == 0) { - values - } else { - throw new IllegalArgumentException("Key (" + key + ") must have length 0 or 1.") - } - } - - // prefix and suffix search are identical for this tree, since we only store keys of length 1 - def prefixSearch(key: String): Map[String, V] = fixSearch(key) - def suffixSearch(key: String): Map[String, V] = fixSearch(key) - - def size: Int = values.size -} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala new file mode 100644 index 0000000000..391809da75 --- /dev/null +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/Alphabet.scala @@ -0,0 +1,107 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.models + +import scala.util.Try + +/** + * Created by bryan on 4/17/15. + * + * An alphabet of symbols and related operations + * + */ +trait Alphabet { + + /** the symbols in this alphabet */ + val symbols: Seq[Symbol] + + /** + * flag if symbols are case-sensitive. 
+ * if true, this alphabet will treat symbols representing upper and lower case symbols as distinct + * if false, this alphabet will treat upper or lower case chars as the same for its symbols + */ + val caseSensitive: Boolean + + /** map of the symbol char to the symbol */ + lazy val symbolMap: Map[Char, Symbol] = + if (caseSensitive) + symbols.map(symbol => symbol.label -> symbol).toMap + else + symbols.flatMap(symbol => Seq(symbol.label.toLower -> symbol, symbol.label.toUpper -> symbol)).toMap + + /** + * + * @param s Each char in this string represents a symbol on the alphabet. + * If the char is not in the alphabet then an IllegalArgumentException is thrown + * @return the reversed complement of the given string. + * @throws IllegalArgumentException if the string contains a symbol which is not in the alphabet + */ + def reverseComplementExact(s: String): String = { + reverseComplement(s, + (symbol: Char) => throw new IllegalArgumentException("Character %s not found in alphabet.".format(symbol)) + ) + } + + /** + * + * @param s Each char in this string represents a symbol on the alphabet. + * @param notFound If the char is not in the alphabet then this function is called. + * default behavior is to return a new Symbol representing the unknown character, + * so that the unknown char is treated as the complement + * @return the reversed complement of the given string. 
- */ + def reverseComplement(s: String, notFound: (Char => Symbol) = ((c: Char) => Symbol(c, c))) = { + s.map(x => Try(apply(x)).getOrElse(notFound(x)).complement).reverse + } + + /** number of symbols in the alphabet */ + def size = symbols.size + + /** + * @param c char to lookup as a symbol in this alphabet + * @return the given symbol + */ + def apply(c: Char): Symbol = symbolMap(c) + +} + +/** + * A symbol in an alphabet + * @param label a character which represents the symbol + * @param complement a character which represents the complement of the symbol + */ +case class Symbol(label: Char, complement: Char) + +/** + * The standard DNA alphabet with A,T,C, and G + */ +class DNAAlphabet extends Alphabet { + + override val caseSensitive = false + + override val symbols = Seq( + Symbol('A', 'T'), + Symbol('T', 'A'), + Symbol('G', 'C'), + Symbol('C', 'G') + ) +} + +object Alphabet { + val dna = new DNAAlphabet +} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala index e931d7adec..7ae90dc3ca 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala @@ -20,7 +20,6 @@ package org.bdgenomics.adam.models import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.util.SequenceUtils import org.bdgenomics.formats.avro.{ Strand, Feature } /** @@ -100,7 +99,7 @@ case class Transcript(id: String, if (strand) referenceSequence.substring(minStart, maxEnd) else - SequenceUtils.reverseComplement(referenceSequence.substring(minStart, maxEnd)) + Alphabet.dna.reverseComplement(referenceSequence.substring(minStart, maxEnd)) } /** @@ -178,7 +177,7 @@ abstract class BlockExtractable(strand: Boolean, region: ReferenceRegion) if (strand) referenceSequence.substring(region.start.toInt, region.end.toInt) else - 
SequenceUtils.reverseComplement(referenceSequence.substring(region.start.toInt, region.end.toInt)) + Alphabet.dna.reverseComplement(referenceSequence.substring(region.start.toInt, region.end.toInt)) } /** @@ -222,11 +221,11 @@ object ReferenceUtils { def folder(acc: Seq[ReferenceRegion], tref: ReferenceRegion): Seq[ReferenceRegion] = acc match { case Seq() => Seq(tref) - case (first: ReferenceRegion) +: Seq(rest) => + case (first: ReferenceRegion) +: rest => if (first.overlaps(tref)) - first.hull(tref) +: Seq(rest) + first.hull(tref) +: rest else - tref +: first +: Seq(rest) + tref +: first +: rest } refs.toSeq.sorted.foldLeft(Seq[ReferenceRegion]())(folder) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala index 7bf5b3e882..d01174c231 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala @@ -32,10 +32,8 @@ import org.bdgenomics.adam.algorithms.consensus.{ import org.bdgenomics.adam.converters.AlignmentRecordConverter import org.bdgenomics.adam.instrumentation.Timers._ import org.bdgenomics.adam.models._ -import org.bdgenomics.adam.models.ReferenceRegion._ import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd.{ ADAMSaveAnyArgs, ADAMSequenceDictionaryRDDAggregator } -import org.bdgenomics.adam.rdd.read.correction.{ ErrorCorrection, TrimReads } import org.bdgenomics.adam.rdd.read.realignment.RealignIndels import org.bdgenomics.adam.rdd.read.recalibration.BaseQualityRecalibration import org.bdgenomics.adam.rich.RichAlignmentRecord @@ -226,23 +224,6 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) }).reduceByKey((k1: Long, k2: Long) => k1 + k2) } - /** - * Cuts reads into _q_-mers, and then finds the _q_-mer weight. 
Q-mers are described in: - * - * Kelley, David R., Michael C. Schatz, and Steven L. Salzberg. "Quake: quality-aware detection - * and correction of sequencing errors." Genome Biol 11.11 (2010): R116. - * - * _Q_-mers are _k_-mers weighted by the quality score of the bases in the _k_-mer. - * - * @param qmerLength The value of _q_ to use for cutting _q_-mers. - * @return Returns an RDD containing q-mer/weight pairs. - * - * @see adamCountKmers - */ - def adamCountQmers(qmerLength: Int): RDD[(String, Double)] = { - ErrorCorrection.countQmers(rdd, qmerLength) - } - def adamSortReadsByReferencePosition(): RDD[AlignmentRecord] = SortReads.time { log.info("Sorting reads by reference position") @@ -350,33 +331,6 @@ class AlignmentRecordRDDFunctions(rdd: RDD[AlignmentRecord]) rdd.filter(RichAlignmentRecord(_).tags.exists(_.tag == tagName)) } - /** - * Trims bases from the start and end of all reads in an RDD. - * - * @param trimStart Number of bases to trim from the start of the read. - * @param trimEnd Number of bases to trim from the end of the read. - * @param readGroup Optional parameter specifying which read group to trim. If omitted, - * all reads are trimmed. - * @return Returns an RDD of trimmed reads. - * - * @note Trimming parameters must be >= 0. - */ - def adamTrimReads(trimStart: Int, trimEnd: Int, readGroup: String = null): RDD[AlignmentRecord] = TrimReadsInDriver.time { - TrimReads(rdd, trimStart, trimEnd, readGroup) - } - - /** - * Trims low quality read prefix/suffixes. The average read prefix/suffix quality is - * calculated from the Phred scaled qualities for read bases. We trim suffixes/prefixes - * that are below a user provided threshold. - * - * @param phredThreshold Phred score for trimming. Defaut value is 20. - * @return Returns an RDD of trimmed reads. 
- */ - def adamTrimLowQualityReadGroups(phredThreshold: Int = 20): RDD[AlignmentRecord] = TrimLowQualityInDriver.time { - TrimReads(rdd, phredThreshold) - } - /** * Saves these AlignmentRecords to two FASTQ files: one for the first mate in each pair, and the other for the second. * diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/correction/ErrorCorrection.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/correction/ErrorCorrection.scala deleted file mode 100644 index ad5d5a9174..0000000000 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/correction/ErrorCorrection.scala +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.rdd.read.correction - -import org.apache.spark.SparkContext._ -import org.apache.spark.Logging -import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.util.PhredUtils -import org.bdgenomics.formats.avro.AlignmentRecord - -private[rdd] object ErrorCorrection extends Logging { - - val ec = new ErrorCorrection - - /** - * Cuts reads into _q_-mers, and then finds the _q_-mer weight. Q-mers are described in: - * - * Kelley, David R., Michael C. Schatz, and Steven L. 
Salzberg. "Quake: quality-aware detection - * and correction of sequencing errors." Genome Biol 11.11 (2010): R116. - * - * _Q_-mers are _k_-mers weighted by the quality score of the bases in the _k_-mer. - * - * @param rdd RDD to count q-mers on. - * @param qmerLength The value of _q_ to use for cutting _q_-mers. - * @return Returns an RDD containing q-mer/weight pairs. - */ - def countQmers(rdd: RDD[AlignmentRecord], - qmerLength: Int): RDD[(String, Double)] = { - // generate qmer counts - rdd.flatMap(ec.readToQmers(_, qmerLength)) - .reduceByKey(_ + _) - } -} - -private[correction] class ErrorCorrection extends Serializable with Logging { - - /** - * Cuts a single read into q-mers. - * - * @param read Read to cut. - * @param qmerLength The length of the qmer to cut. - * @return Returns an iterator containing q-mer/weight mappings. - */ - def readToQmers(read: AlignmentRecord, - qmerLength: Int = 20): Iterator[(String, Double)] = { - // get read bases and quality scores - val bases = read.getSequence.toSeq - val scores = read.getQual.toString.toCharArray.map(q => { - PhredUtils.phredToSuccessProbability(q - 33) - }) - - // zip and put into sliding windows to get qmers - bases.zip(scores) - .sliding(qmerLength) - .map(w => { - // bases are first in tuple - val b = w.map(_._1) - - // quals are second - val q = w.map(_._2) - - // reduce bases into string, reduce quality scores - (b.map(_.toString).reduce(_ + _), q.reduce(_ * _)) - }) - } - -} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/correction/TrimReads.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/correction/TrimReads.scala deleted file mode 100644 index dd55f2ebe1..0000000000 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/correction/TrimReads.scala +++ /dev/null @@ -1,388 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.rdd.read.correction - -import org.apache.spark.Logging -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.RDD -import org.bdgenomics.formats.avro.AlignmentRecord -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.util.PhredUtils -import org.bdgenomics.adam.instrumentation.Timers._ -import scala.annotation.tailrec -import scala.math.{ log => mathLog, exp } - -private[rdd] object TrimReads extends Logging { - - /** - * Finds the optimal trimming length for the start and end of a read, and trims the read. - * - * @param rdd RDD of reads to trim. - * @param phredThreshold Phred score threshold for trimming read start/end. - * @return Returns an RDD of trimmed reads. 
- */ - def apply(rdd: RDD[AlignmentRecord], phredThreshold: Int): RDD[AlignmentRecord] = { - val tr = new TrimReads - - // get read length - val readLength = rdd.first().getSequence.length - - // map read quality scores into doubles - log.info("Collecting read quality scores.") - val doubleRdd: RDD[((String, Int), Double)] = rdd.flatMap(tr.readToDoubles) - .cache() - - // reduce this by key, and also get counts - log.info("Summarizing quality scores.") - val columnValues = doubleRdd.reduceByKeyLocally(_ + _) - val columnCounts = doubleRdd.countByKey() - - // we are done with doubleRdd - so, unpersist to free memory - doubleRdd.unpersist() - - // get read group ids - val rgIds = columnValues.keys - .map(p => p._1) - .toSet - - // loop per read group - var loopRdd = rdd - rgIds.foreach(rg => { - // get column values and counts per this read group - val qualities: Seq[(Int, Double)] = columnValues.filterKeys(k => k._1 == rg) - .map(kv => (kv._1._2, kv._2)) - .toSeq - .sortBy(kv => kv._1) - val counts: Seq[(Int, Long)] = columnCounts.filterKeys(k => k._1 == rg) - .map(kv => (kv._1._2, kv._2)) - .toSeq - .sortBy(kv => kv._1) - - // zip columns and counts, and take the mean quality score - val meanQuals: Seq[Int] = qualities.zip(counts) - .map(qc => { - val ((p1, qual), (p2, count)) = qc - - // check that positions are the same - assert(p1 == p2, "During zip, value ranks disagree.") - - // convert back to phred - PhredUtils.successProbabilityToPhred(exp(qual / count)) - }) - - // find the number of bases off of the start/end that are below the requested - // phred score threshold - val trimStart = meanQuals.takeWhile(_ < phredThreshold).length - val trimEnd = meanQuals.reverse.takeWhile(_ < phredThreshold).length - - // call trimming function - loopRdd = apply(loopRdd, trimStart, trimEnd, rg) - }) - - // return rdd - loopRdd - } - - /** - * Trims bases from the start and end of reads. - * - * @param rdd An RDD of reads to trim. 
- * @param trimStart The number of bases to trim from the start of the read. - * @param trimEnd The number of bases to trim from the end of the read. - * @param rg Optional read group ID parameter. Sets which read group to apply this trim to. - * Set to -1 if you would like to trim all reads. - * @return Returns an RDD of trimmed reads. - */ - def apply(rdd: RDD[AlignmentRecord], - trimStart: Int, - trimEnd: Int, - rg: String = null): RDD[AlignmentRecord] = { - assert(trimStart >= 0 && trimEnd >= 0, - "Trim parameters must be positive.") - assert(rdd.first().getSequence.length > trimStart + trimEnd, - "Cannot trim more than the length of the read.") - - log.info("Trimming reads.") - - val tr = new TrimReads - rdd.map(read => { - // only trim reads that are in this read group - if (read.getRecordGroupName == rg || rg == null) { - tr.trimRead(read, trimStart, trimEnd) - } else { - read - } - }) - } -} - -private[correction] class TrimReads extends Serializable { - - /** - * Converts a read into an array of doubles which are the base quality scores - * as success probabilities. - * - * @param read Read to convert. - * @return Returns an array of log scaled success probabilities for each base, zipped with - * the position of the double in the read, and the read group ID. 'null' is returned if the - * read group is not set. - */ - def readToDoubles(read: AlignmentRecord): Array[((String, Int), Double)] = { - val rgIdx: String = read.getRecordGroupName - - read.getQual - .toArray - .map(q => mathLog(PhredUtils.phredToSuccessProbability(q.toInt - 33))) - .zipWithIndex - .map(p => ((rgIdx, p._2), p._1)) - } - - /** - * Trims bases off of an MD tag. - * - * @param mdTag Tag to trim. - * @param trimStart Bases to trim off of the start. - * @param trimEnd Bases to trim off of the end. - * @return Returns the trimmed MD tag. 
- */ - def trimMdTag(mdTag: String, trimStart: Int, trimEnd: Int): String = { - @tailrec def trimFront(m: String, trim: Int): String = { - if (trim <= 0) { - if (m(0).isLetter) { - "0" + m - } else { - m - } - } else { - val (md: String, t: Int) = if (m(0) == '^') { - (m.drop(1).dropWhile(_.isLetter), trim) - } else if (m(0).isLetter) { - (m.drop(1), trim - 1) - } else if (m(0).isDigit) { - val num = m.takeWhile(_.isDigit).toInt - if (num == 0) { - (m.dropWhile(_.isDigit), trim) - } else { - ((num - 1) + m.dropWhile(_.isDigit), trim - 1) - } - } else { - throw new IllegalArgumentException("Illegal character in MD tag: " + m) - } - - // call recursively - trimFront(md, t) - } - } - - // this should be called on a reversed MD tag, and returns a reversed MD tag - // this is a bit of a smell, but is done because some scala string operations - // are only defined from the start of a string - def trimRear(m: String, trim: Int): String = { - if (trim <= 0) { - if (m(0).isLetter) { - "0" + m - } else { - m - } - } else { - val (md: String, t: Int) = if (m(0).isDigit) { - // get the number of matches at this site - val num = m.takeWhile(_.isDigit).reverse.toInt - - // get the remainder of the MD tag - val remainder = m.dropWhile(_.isDigit) - - // if we have 0 at this location, nothing to trim, just remove and move on - if (num == 0) { - (remainder, trim) - } else { - ((num - 1).toString.reverse + remainder, trim - 1) - } - } else { - // if we have bases here, there is a mismatch or a deletion, so collect bases and parse - val bases = m.takeWhile(!_.isDigit) - - // get updated trim length - val t = if (bases.last == '^') { // we have a deletion - trim // excise deletion - } else if (bases.length == 1) { // we have a mismatch - trim - 1 // trim base - } else { // something went wrong - throw new IllegalArgumentException("Illegal base string in MD tag: " + bases) - } - - (m.dropWhile(!_.isDigit), t) - } - - // call recursively - trimRear(md, t) - } - } - - // call helper 
functions, we must reverse input and output of trimRear - trimRear(trimFront(mdTag, trimStart).reverse, trimEnd).reverse - } - - /** - * Trims a CIGAR string. This method handles several corner cases: - * - * - When trimming through a deletion/reference skip, the deletion/skip is excised. - * - This method updates the read start when trimming M/X/= bases. - * - * The trimmed segments are replaced with hard clipping. - * - * @param cigar CIGAR string to trim. - * @param trimStart Number of bases to trim from the start of the CIGAR. Must be >= 0. - * @param trimEnd Number of bases to trim from the end of the CIGAR. Must be >= 0. - * @param startPos Start position of the alignment. - * @return Returns a tuple containing the (updated cigar, updated alignment start position). - */ - def trimCigar(cigar: String, trimStart: Int, trimEnd: Int, startPos: Long, endPos: Long): (String, Long, Long) = TrimCigar.time { - @tailrec def trimFront(c: String, trim: Int, start: Long): (String, Long) = { - if (trim <= 0) { - (c, start) - } else { - val count = c.takeWhile(_.isDigit).toInt - val operator = c.dropWhile(_.isDigit).head - val withoutOp = c.dropWhile(_.isDigit).drop(1) - - val (cNew, tNew, sNew) = if (operator == 'D' || operator == 'N') { - // must trim all the way through a reference deletion or skip - (withoutOp, trim, start + count) - } else { - // get updated cigar - val cNew = if (count == 1) { - withoutOp - } else { - (count - 1) + operator.toString + withoutOp - } - - // get updated start - val sNew = if (operator == 'M' || - operator == '=' || - operator == 'X') { - // if we are trimming into an alignment match, we must update the start position - start + 1 - } else { - start - } - - (cNew, trim - 1, sNew) - } - - trimFront(cNew, tNew, sNew) - } - } - - def trimBack(c: String, trim: Int, end: Long): (String, Long) = { - @tailrec def trimBackHelper(ch: String, t: Int, e: Long): (String, Long) = { - if (t <= 0) { - (ch.reverse, e) - } else { - val count = 
ch.drop(1).takeWhile(_.isDigit).reverse.toInt - val operator = ch.head - val withoutOp = ch.drop(1).dropWhile(_.isDigit) - - def eNext(): Long = { - if (operator == 'M' || - operator == '=' || - operator == 'X') { - // if we are trimming into an alignment match, we must update the start position - e - 1 - } else { - e - } - } - - val (cNew, tNew, eNew) = if (operator == 'D' || operator == 'N') { - // must trim all the way through a reference deletion or skip - (withoutOp, t, e - count) - } else if (count == 1) { - (withoutOp, t - 1, eNext()) - } else { - (operator.toString + (count - 1).toString.reverse + withoutOp, t - 1, eNext()) - } - - trimBackHelper(cNew, tNew, eNew) - } - } - - if (trim <= 0) { - (c, end) - } else { - trimBackHelper(c.reverse, trim, end) - } - } - - // add hard clipping and return - val trimPrefix = if (trimStart > 0) { - trimStart + "H" - } else { - "" - } - val trimSuffix = if (trimEnd > 0) { - trimEnd + "H" - } else { - "" - } - - val (cigarFrontTrimmed, newStart) = trimFront(cigar, trimStart, startPos) - val (cigarBackTrimmed, newEnd) = trimBack(cigarFrontTrimmed, trimEnd, endPos) - - (trimPrefix + cigarBackTrimmed + trimSuffix, newStart, newEnd) - } - - /** - * Trims a read. Updates the read sequence, qualities, start position, and cigar. - * All other fields are copied over from the old read. - * - * @param read Read to trim. - * @param trimStart Number of bases to trim from the start of the read. - * @param trimEnd Number of bases to trim from the end of the read. - * @return Trimmed read. 
- */ - def trimRead(read: AlignmentRecord, trimStart: Int, trimEnd: Int): AlignmentRecord = TrimRead.time { - // trim sequence and quality value - val seq: String = read.getSequence.toString.drop(trimStart).dropRight(trimEnd) - val qual: String = read.getQual.toString.drop(trimStart).dropRight(trimEnd) - - // make copy builder - val builder = AlignmentRecord.newBuilder(read) - .setSequence(seq) - .setBasesTrimmedFromStart(Option(read.getBasesTrimmedFromStart).fold(0)(v => v) + trimStart) - .setBasesTrimmedFromEnd(Option(read.getBasesTrimmedFromEnd).fold(0)(v => v) + trimEnd) - .setQual(qual) - - // clean up cigar and read start position - Option(read.getCigar).filter(_ != "*") - .map(trimCigar(_, trimStart, trimEnd, read.getStart, read.getEnd)) - .foreach(p => { - val (c, s, e) = p - builder.setCigar(c) - .setStart(s) - .setEnd(e) - - // also, clean up md tag - Option(read.getMismatchingPositions).map(trimMdTag(_, trimStart, trimEnd)) - .foreach(builder.setMismatchingPositions) - }) - - // finish building and return - builder.build() - } -} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/SequenceUtils.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/SequenceUtils.scala deleted file mode 100644 index b2a5d9e451..0000000000 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/SequenceUtils.scala +++ /dev/null @@ -1,53 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.util - -/** - * This is a hack, and should be replaced! - * See Issue #410 -> https://github.com/bigdatagenomics/adam/issues/410 - * - * Basically, I introduced this as a set of helper functions for suppporting - * the Gene/Transcript model work, which required the ability to extract - * transcript sequences from a chromosomal sequence. - * - * But this should be replaced with a proper "Alphabet" object. --TWD - */ -object SequenceUtils { - - def reverseComplement(dna: String): String = { - val builder = new StringBuilder() - dna.foreach(c => builder.append(complement(c))) - builder.reverse.toString() - } - - def complement(c: Char): Char = { - c match { - case 'A' => 'T' - case 'a' => 't' - case 'G' => 'C' - case 'g' => 'c' - case 'C' => 'G' - case 'c' => 'g' - case 'T' => 'A' - case 't' => 'a' - case _ => c - } - } - -} - diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/algorithms/prefixtrie/DNAPrefixTrieSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/algorithms/prefixtrie/DNAPrefixTrieSuite.scala deleted file mode 100644 index ef9b130055..0000000000 --- a/adam-core/src/test/scala/org/bdgenomics/adam/algorithms/prefixtrie/DNAPrefixTrieSuite.scala +++ /dev/null @@ -1,123 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.algorithms.prefixtrie - -import org.scalatest.FunSuite - -class DNAPrefixTrieSuite extends FunSuite { - - test("it should not be possible to create an empty prefix trie") { - intercept[AssertionError] { - val trie = DNAPrefixTrie(Map()) - } - } - - test("can retrieve all values with a completely-wildcard query") { - val trie = DNAPrefixTrie(Map("AA" -> 1, "TT" -> 2, "CC" -> 3)) - assert(trie.size === 3) - assert(trie.find("**").size === 3) - } - - test("building a trie with illegal characters generates an IllegalArgumentException") { - intercept[IllegalArgumentException] { - DNAPrefixTrie(Map("ATMGC" -> 0)) - } - } - - test("kmers with ambiguous bases don't get added to the trie") { - val trie = DNAPrefixTrie(Map("ANCT" -> 0.5, - "ACTN" -> 1.0)) - - assert(trie.size === 0) - assert(!trie.contains("ANCT")) - assert(!trie.contains("ACTN")) - } - - test("building a trie fails if we have different length keys") { - intercept[AssertionError] { - DNAPrefixTrie(Map("ACTCGA" -> 1.2, - "ACTCA" -> 1.1)) - } - } - - test("insert keys into a trie, and retrieve them") { - val trie = DNAPrefixTrie(Map("ACCTA" -> 1, - "ACTGA" -> 2, - "CCTCA" -> 3)) - - assert(trie.size === 3) - - // check for values inserted into trie - assert(trie.contains("ACCTA")) - assert(trie.get("ACCTA") === 1) - assert(trie.contains("ACTGA")) - assert(trie.get("ACTGA") === 2) - 
assert(trie.contains("CCTCA")) - assert(trie.get("CCTCA") === 3) - } - - val sampleTrie = DNAPrefixTrie(Map( - "AACACT" -> 1, - "AACACC" -> 4, - "ATGGTC" -> 2, - "CACTGC" -> 5, - "CCTCGA" -> 4, - "GGCGTC" -> 6, - "TCCTCG" -> 4, - "TTCTTC" -> 2)) - - test("perform a wildkey search") { - val foundKVs = sampleTrie.search("A****C") - - assert(foundKVs.size === 2) - assert(foundKVs("AACACC") === 4) - assert(foundKVs("ATGGTC") === 2) - } - - test("perform a prefix search") { - val foundKVs = sampleTrie.prefixSearch("AACA") - - assert(foundKVs.size === 2) - assert(foundKVs("AACACT") === 1) - assert(foundKVs("AACACC") === 4) - } - - test("perform a suffix search") { - val foundKVs = sampleTrie.suffixSearch("TC") - - assert(foundKVs.size === 3) - assert(foundKVs("ATGGTC") === 2) - assert(foundKVs("GGCGTC") === 6) - assert(foundKVs("TTCTTC") === 2) - } - - test("test getters") { - // test on a key that is in the trie - assert(sampleTrie.get("AACACT") === 1) - assert(sampleTrie.getOrElse("AACACT", 4) === 1) - assert(sampleTrie.getIfExists("AACACT").isDefined) - assert(sampleTrie.getIfExists("AACACT").get === 1) - - // test on a key that is not in the trie - intercept[IllegalArgumentException] { - sampleTrie.get("AAGACT") - } - assert(sampleTrie.getOrElse("AAGACT", 4) === 4) - assert(sampleTrie.getIfExists("AAGACT").isEmpty) - } -} diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/AlphabetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/AlphabetSuite.scala new file mode 100644 index 0000000000..dd96e030f8 --- /dev/null +++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/AlphabetSuite.scala @@ -0,0 +1,106 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.models + +import org.bdgenomics.adam.util.ADAMFunSuite + +/** + * Created by bryan on 4/18/15. + * + * Tests the alphabet class + */ +class AlphabetSuite extends ADAMFunSuite { + + val testSymbols = Seq( + Symbol('a', 'z'), + Symbol('b', 'y'), + Symbol('c', 'x') + ) + + val csAlpha = new Alphabet { + override val caseSensitive = true + override val symbols = testSymbols + } + + test("test size of a case-sensitive alphabet") { + assert(3 == csAlpha.size) + } + + test("test apply of a case-sensitive alphabet") { + assert(Symbol('b', 'y') == csAlpha('b')) + val e = intercept[NoSuchElementException] { + csAlpha('B') + } + assert(e.getMessage == "key not found: B") + } + + test("test reverse complement of a case-sensitive alphabet") { + assert("xyz" == csAlpha.reverseComplement("abc")) + assert("CBA" == csAlpha.reverseComplement("ABC")) + assert("" == csAlpha.reverseComplement("")) + } + + test("test exact reverse complement of a case-sensitive alphabet") { + assert("zzyyxx" == csAlpha.reverseComplement("ccbbaa")) + assert("" == csAlpha.reverseComplement("")) + + val e = intercept[IllegalArgumentException] { + csAlpha.reverseComplementExact("ABC") + } + assert(e.getMessage == "Character A not found in alphabet.") + } + + val ciAlpha = new Alphabet { + override val caseSensitive = false + override val symbols = testSymbols + } + + test("test size of a 
case-insensitive alphabet") { + assert(3 == ciAlpha.size) + } + + test("test apply of a case-insensitive alphabet") { + assert(Symbol('b', 'y') == ciAlpha('b')) + assert(Symbol('b', 'y') == ciAlpha('B')) + } + + test("test reverse complement of a case-insensitive alphabet") { + assert("xyz" == ciAlpha.reverseComplement("abc")) + assert("xyz" == ciAlpha.reverseComplement("ABC")) + assert("" == ciAlpha.reverseComplement("")) + } + + test("test exact reverse complement of a case-insensitive alphabet") { + assert("zzyyxx" == ciAlpha.reverseComplement("ccbbaa")) + assert("zzyyxx" == ciAlpha.reverseComplement("cCbBaA")) + assert("" == ciAlpha.reverseComplement("")) + + val e = intercept[IllegalArgumentException] { + ciAlpha.reverseComplementExact("xxx") + } + assert(e.getMessage == "Character x not found in alphabet.") + } + + test("DNA alphabet") { + assert(4 == Alphabet.dna.size) + assert("CGCGATAT" == Alphabet.dna.reverseComplement("atatcgcg")) + assert("CGxATAT" == Alphabet.dna.reverseComplement("ATATxcg")) + } + +} + diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceUtilsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceUtilsSuite.scala new file mode 100644 index 0000000000..9f1ca9eb1a --- /dev/null +++ b/adam-core/src/test/scala/org/bdgenomics/adam/models/ReferenceUtilsSuite.scala @@ -0,0 +1,61 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.models + +import org.bdgenomics.adam.models.ReferenceUtils._ +import org.bdgenomics.adam.util.ADAMFunSuite + +class ReferenceUtilsSuite extends ADAMFunSuite { + + test("unionReferenceSet: empty") { + val regions = Seq() + + assert(unionReferenceSet(regions) === regions) + } + + test("unionReferenceSet: one region") { + val regions = Seq(ReferenceRegion("1", 3, 7)) + + assert(unionReferenceSet(regions) === regions) + } + + test("unionReferenceSet: multiple regions on one contig, all overlap") { + val regions = Seq(ReferenceRegion("1", 3, 7), + ReferenceRegion("1", 1, 5), + ReferenceRegion("1", 6, 10)) + + assert(unionReferenceSet(regions) === Seq(ReferenceRegion("1", 1, 10))) + } + + test("unionReferenceSet: multiple regions on one contig, some overlap") { + val regions = Seq(ReferenceRegion("1", 3, 7), + ReferenceRegion("1", 1, 5), + ReferenceRegion("1", 8, 10)) + + assert(unionReferenceSet(regions) === Seq(ReferenceRegion("1", 8, 10), ReferenceRegion("1", 1, 7))) + } + + test("unionReferenceSet: multiple regions on multiple contigs") { + val regions = Seq(ReferenceRegion("1", 3, 7), + ReferenceRegion("1", 1, 5), + ReferenceRegion("2", 4, 8)) + + assert(unionReferenceSet(regions) === Seq(ReferenceRegion("2", 4, 8), ReferenceRegion("1", 1, 7))) + } + +} diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/correction/ErrorCorrectionSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/correction/ErrorCorrectionSuite.scala deleted file mode 100644 index 9d62a81a78..0000000000 --- 
a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/correction/ErrorCorrectionSuite.scala +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.rdd.read.correction - -import org.bdgenomics.formats.avro.AlignmentRecord -import org.scalatest.FunSuite -import scala.math.abs - -class ErrorCorrectionSuite extends FunSuite { - - val ec = new ErrorCorrection - - def fpCompare(a: Double, b: Double, epsilon: Double = 1e-3): Boolean = abs(a - b) < epsilon - - test("cut a short read into qmers") { - val read = AlignmentRecord.newBuilder - .setSequence("ACTCATG") - .setQual("??;957:") - .build() - - val qmers = ec.readToQmers(read, 3).toMap - - // check that the qmer count is correct - assert(qmers.size === 5) - - // find our qmers and check their values - assert(fpCompare(qmers("ACT"), 0.995)) - assert(fpCompare(qmers("CTC"), 0.992)) - assert(fpCompare(qmers("TCA"), 0.983)) - assert(fpCompare(qmers("CAT"), 0.979)) - assert(fpCompare(qmers("ATG"), 0.980)) - } -} diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/correction/TrimReadsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/correction/TrimReadsSuite.scala deleted file mode 100644 index 
7270f4c8a6..0000000000 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/correction/TrimReadsSuite.scala +++ /dev/null @@ -1,175 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.rdd.read.correction - -import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.util.ADAMFunSuite -import org.bdgenomics.formats.avro.AlignmentRecord - -class TrimReadsSuite extends ADAMFunSuite { - - val ec = new TrimReads - - def makeRead(sequence: String, qual: String): AlignmentRecord = { - AlignmentRecord.newBuilder - .setSequence(sequence) - .setQual(qual) - .build - } - - test("trim a few md tags") { - assert(ec.trimMdTag("10", 2, 0) === "8") - assert(ec.trimMdTag("2A10", 4, 0) === "9") - assert(ec.trimMdTag("0C10C1", 1, 2) === "10") - assert(ec.trimMdTag("1^AC3", 2, 0) === "2") - assert(ec.trimMdTag("3^AC1", 0, 2) === "2") - assert(ec.trimMdTag("2A0C0", 3, 0) === "0C0") - assert(ec.trimMdTag("2A0C0", 0, 1) === "2A0") - } - - test("trim a few cigars that just have clipping and matches") { - // trim a single element incompletely - assert(ec.trimCigar("2S10M", 1, 0, 0L, 10L) === ("1H1S10M", 0L, 10L)) - assert(ec.trimCigar("10M3S", 0, 2, 0L, 10L) === 
("10M1S2H", 0L, 10L)) - assert(ec.trimCigar("2S10M3S", 1, 2, 0L, 10L) === ("1H1S10M1S2H", 0L, 10L)) - - // trim a single element completely - assert(ec.trimCigar("2S10M", 2, 0, 0L, 10L) === ("2H10M", 0L, 10L)) - assert(ec.trimCigar("10M3S", 0, 3, 0L, 10L) === ("10M3H", 0L, 10L)) - assert(ec.trimCigar("2S10M3S", 2, 3, 0L, 10L) === ("2H10M3H", 0L, 10L)) - - // trim into multiple elements - assert(ec.trimCigar("2S10M", 3, 0, 0L, 10L) === ("3H9M", 1L, 10L)) - assert(ec.trimCigar("10M3S", 0, 4, 0L, 10L) === ("9M4H", 0L, 9L)) - assert(ec.trimCigar("2S10M3S", 3, 4, 0L, 10L) === ("3H8M4H", 1L, 9L)) - } - - test("trim cigars with indels") { - // trim through a deletion - assert(ec.trimCigar("2S2M2D4M", 5, 0, 0L, 8L) === ("5H3M", 5L, 8L)) - assert(ec.trimCigar("4M1D1M", 0, 3, 0L, 6L) === ("2M3H", 0L, 2L)) - assert(ec.trimCigar("2S2M2N4M", 5, 0, 0L, 8L) === ("5H3M", 5L, 8L)) - assert(ec.trimCigar("4M1N1M", 0, 3, 0L, 6L) === ("2M3H", 0L, 2L)) - - // trim into an insert - assert(ec.trimCigar("2M2I10M", 3, 0, 0L, 12L) === ("3H1I10M", 2L, 12L)) - assert(ec.trimCigar("10M3I1M", 0, 3, 0L, 11L) === ("10M1I3H", 0L, 10L)) - } - - test("trim a few reads") { - // trim from the front only, read without cigar - val read1 = AlignmentRecord.newBuilder - .setSequence("ACTCGCCCACTCA") - .setQual("##/9:::::::::") - .build - val trimmedRead1 = ec.trimRead(read1, 2, 0) - - assert(trimmedRead1.getSequence.toString === "TCGCCCACTCA") - assert(trimmedRead1.getQual.toString === "/9:::::::::") - - // trim from both ends, read with cigar - val read2 = AlignmentRecord.newBuilder - .setSequence("ACTCGCCCACTCAAA") - .setQual("##/9:::::::::##") - .setCigar("2S11M2S") - .setStart(5L) - .setEnd(16L) - .build - val trimmedRead2 = ec.trimRead(read2, 2, 2) - - assert(trimmedRead2.getSequence.toString === "TCGCCCACTCA") - assert(trimmedRead2.getQual.toString === "/9:::::::::") - assert(trimmedRead2.getStart === 5L) - assert(trimmedRead2.getEnd === 16L) - assert(trimmedRead2.getCigar.toString === "2H11M2H") - - 
// trim from both ends, read with cigar - val read3 = AlignmentRecord.newBuilder - .setSequence("ACTCGCCCACTCAAA") - .setQual("##/9:::::::::##") - .setCigar("15M") - .setStart(5L) - .setEnd(20L) - .build - val trimmedRead3 = ec.trimRead(read3, 4, 3) - - assert(trimmedRead3.getSequence.toString === "GCCCACTC") - assert(trimmedRead3.getQual.toString === "::::::::") - assert(trimmedRead3.getStart === 9L) - assert(trimmedRead3.getEnd === 17L) - assert(trimmedRead3.getCigar.toString === "4H8M3H") - } - - sparkTest("correctly trim an RDD of reads") { - // put the same sequence at start and end of reads - val reads = Seq(makeRead("AACTCGACGCTTT", "##::::::::$$$"), - makeRead("AACTCCCTGCTTT", "##::::::::$$$"), - makeRead("AACTCATAGCTTT", "##::::::::$$$"), - makeRead("AACTCCCAGCTTT", "##::::::::$$$"), - makeRead("AACTCGGAGCTTT", "##::::::::$$$")) - val rdd = sc.parallelize(reads) - - val trimFront = TrimReads(rdd, 2, 0) - - trimFront.collect().foreach(r => { - assert(r.getSequence.length === 11) - assert(r.getQual.length === 11) - assert(r.getSequence.toString.startsWith("CT")) - assert(r.getSequence.toString.endsWith("TTT")) - assert(r.getQual.toString.startsWith("::")) - assert(r.getQual.toString.endsWith("$$$")) - assert(r.getBasesTrimmedFromStart === 2) - assert(r.getBasesTrimmedFromEnd === 0) - }) - - val trimEnd = TrimReads(trimFront, 0, 3) - - trimEnd.collect().foreach(r => { - assert(r.getSequence.length === 8, r.getSequence) - assert(r.getQual.length === 8) - assert(r.getSequence.toString.startsWith("CT")) - assert(r.getSequence.toString.endsWith("GC")) - assert(r.getQual.toString === "::::::::") - assert(r.getBasesTrimmedFromStart === 2) - assert(r.getBasesTrimmedFromEnd === 3) - }) - } - - sparkTest("adaptively trim reads") { - val readsFilepath = ClassLoader.getSystemClassLoader.getResource("bqsr1.sam").getFile - val reads: RDD[AlignmentRecord] = sc.loadAlignments(readsFilepath) - - // put all reads into a single read group - val readsSingleRG = reads.map(read 
=> { - AlignmentRecord.newBuilder(read) - .setRecordGroupName("group0") - .build() - }) - - // trim reads - use phred Q10 as threshold - val trimmed = TrimReads(readsSingleRG, 10) - - // we should trim the first and last 5 bases off all reads - trimmed.collect().foreach(r => { - assert(r.getBasesTrimmedFromStart === 5) - assert(r.getBasesTrimmedFromEnd === 5) - }) - } - -} diff --git a/bin/compute-adam-jars.sh b/bin/compute-adam-jars.sh index 0591c4cb2c..5467045de8 100755 --- a/bin/compute-adam-jars.sh +++ b/bin/compute-adam-jars.sh @@ -24,6 +24,6 @@ CLASSPATH=$("$ADAM_REPO"/bin/compute-adam-classpath.sh) # list of jars to ship with spark; trim off the first and last from the CLASSPATH # TODO: brittle? assumes appassembler always puts the $BASE/etc first and the CLI jar last -ADAM_JARS=$(echo "$CLASSPATH" | tr ":" "," | cut -d "," -f 2- | rev | cut -d "," -f 2- | rev) +ADAM_JARS=$(echo "$CLASSPATH" | tr ":" "," | cut -d "," -f 2-) echo "$ADAM_JARS"