diff --git a/deeptools/cm.py b/deeptools/cm.py index 47bcf16285..14da876c29 100644 --- a/deeptools/cm.py +++ b/deeptools/cm.py @@ -30,8 +30,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from matplotlib import colors, colormaps as mpl_cm - +from matplotlib import colors _rocket_lut = [ [0.01060815, 0.01808215, 0.10018654], @@ -289,7 +288,7 @@ [0.97912374, 0.90313207, 0.83979337], [0.979891, 0.90894778, 0.84827858], [0.98067764, 0.91476465, 0.85676611], - [0.98137749, 0.92061729, 0.86536915] + [0.98137749, 0.92061729, 0.86536915], ] @@ -549,7 +548,7 @@ [0.84857662, 0.9498573, 0.8776059], [0.8564431, 0.95309792, 0.88414253], [0.86429066, 0.95635719, 0.89067759], - [0.87218969, 0.95960708, 0.89725384] + [0.87218969, 0.95960708, 0.89725384], ] @@ -809,7 +808,7 @@ [0.67000176, 0.23511902, 0.24650278], [0.66693423, 0.22859879, 0.24124404], [0.6638441, 0.22201742, 0.2359961], - [0.66080672, 0.21526712, 0.23069468] + [0.66080672, 0.21526712, 0.23069468], ] @@ -1069,13 +1068,25 @@ [0.9932672, 0.79848979, 0.63231691], [0.99535958, 0.80926704, 0.64687278], [0.99740544, 0.82008078, 0.66150571], - [0.9992197, 0.83100723, 0.6764127] + [0.9992197, 0.83100723, 0.6764127], ] _luts = [_rocket_lut, _mako_lut, _vlag_lut, _icefire_lut] _names = ["rocket", "mako", "vlag", "icefire"] + +def mycmregister(name, cmap): + import matplotlib + + if hasattr(matplotlib.cm, "register_cmap"): + matplotlib.cm.register_cmap(name, cmap) + elif hasattr(matplotlib.colors, "register_cmap"): + matplotlib.colors.register_cmap(name, cmap) + else: + matplotlib.colormaps.register(cmap, name=name) + + for _lut, _name in zip(_luts, _names): _cmap = colors.ListedColormap(_lut, _name) @@ -1084,5 +1095,5 @@ _cmap_r = colors.ListedColormap(_lut[::-1], _name + "_r") locals()[_name + "_r"] = _cmap_r - mpl_cm.register(_cmap, name=_name) - mpl_cm.register(_cmap_r, name=_name + "_r") + mycmregister(_name, _cmap) + mycmregister(_name + "_r", _cmap_r) diff --git a/deeptools/computeGCBias.py b/deeptools/computeGCBias.py index f261a9fc14..7209b51007 100755 --- a/deeptools/computeGCBias.py +++ b/deeptools/computeGCBias.py @@ -17,7 +17,7 @@ from deeptools import bamHandler debug = 0 -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parse_arguments(args=None): @@ -26,15 +26,16 @@ def parse_arguments(args=None): parser = argparse.ArgumentParser( parents=[requiredArgs, parentParser], formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='Computes the GC-bias using Benjamini\'s method ' - '[Benjamini & Speed (2012). Nucleic Acids Research, 40(10). doi: 10.1093/nar/gks001]. ' - 'The GC-bias is visualized and the resulting table can be used to' - 'correct the bias with `correctGCBias`.', - usage='computeGCBias ' - '-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit -l 200 --GCbiasFrequenciesFile freq.txt\n' - 'help: computeGCBias -h / computeGCBias --help', - conflict_handler='resolve', - add_help=False) + description="Computes the GC-bias using Benjamini's method " + "[Benjamini & Speed (2012). Nucleic Acids Research, 40(10). doi: 10.1093/nar/gks001]. 
" + "The GC-bias is visualized and the resulting table can be used to" + "correct the bias with `correctGCBias`.", + usage="computeGCBias " + "-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit -l 200 --GCbiasFrequenciesFile freq.txt\n" + "help: computeGCBias -h / computeGCBias --help", + conflict_handler="resolve", + add_help=False, + ) return parser @@ -42,96 +43,117 @@ def parse_arguments(args=None): def getRequiredArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') - - required.add_argument('--bamfile', '-b', - metavar='bam file', - help='Sorted BAM file. ', - required=True) - - required.add_argument('--effectiveGenomeSize', - help='The effective genome size is the portion ' - 'of the genome that is mappable. Large fractions of ' - 'the genome are stretches of NNNN that should be ' - 'discarded. Also, if repetitive regions were not ' - 'included in the mapping of reads, the effective ' - 'genome size needs to be adjusted accordingly. ' - 'A table of values is available here: ' - 'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .', - default=None, - type=int, - required=True) - - required.add_argument('--genome', '-g', - help='Genome in two bit format. Most genomes can be ' - 'found here: http://hgdownload.cse.ucsc.edu/gbdb/ ' - 'Search for the .2bit ending. Otherwise, fasta ' - 'files can be converted to 2bit using the UCSC ' - 'programm called faToTwoBit available for different ' - 'plattforms at ' - 'http://hgdownload.cse.ucsc.edu/admin/exe/', - metavar='2bit FILE', - required=True) - - required.add_argument('--GCbiasFrequenciesFile', '-freq', '-o', - help='Path to save the file containing ' - 'the observed and expected read frequencies per %%GC-' - 'content. This file is needed to run the ' - 'correctGCBias tool. This is a text file.', - type=argparse.FileType('w'), - metavar='FILE', - required=True) + required = parser.add_argument_group("Required arguments") + + required.add_argument( + "--bamfile", "-b", metavar="bam file", help="Sorted BAM file. ", required=True + ) + + required.add_argument( + "--effectiveGenomeSize", + help="The effective genome size is the portion " + "of the genome that is mappable. Large fractions of " + "the genome are stretches of NNNN that should be " + "discarded. Also, if repetitive regions were not " + "included in the mapping of reads, the effective " + "genome size needs to be adjusted accordingly. " + "A table of values is available here: " + "http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .", + default=None, + type=int, + required=True, + ) + + required.add_argument( + "--genome", + "-g", + help="Genome in two bit format. Most genomes can be " + "found here: http://hgdownload.cse.ucsc.edu/gbdb/ " + "Search for the .2bit ending. Otherwise, fasta " + "files can be converted to 2bit using the UCSC " + "programm called faToTwoBit available for different " + "plattforms at " + "http://hgdownload.cse.ucsc.edu/admin/exe/", + metavar="2bit FILE", + required=True, + ) + + required.add_argument( + "--GCbiasFrequenciesFile", + "-freq", + "-o", + help="Path to save the file containing " + "the observed and expected read frequencies per %%GC-" + "content. This file is needed to run the " + "correctGCBias tool. 
+        type=argparse.FileType("w"),
+        metavar="FILE",
+        required=True,
+    )
 
     # define the optional arguments
-    optional = parser.add_argument_group('Optional arguments')
-    optional.add_argument('--fragmentLength', '-l',
-                          help='Fragment length used for the sequencing. If '
-                          'paired-end reads are used, the fragment length is '
-                          'computed based from the bam file',
-                          type=int)
-
-    optional.add_argument("--help", "-h", action="help",
-                          help="show this help message and exit")
-
-    optional.add_argument('--sampleSize',
-                          default=5e7,
-                          help='Number of sampling points to be considered. (Default: %(default)s)',
-                          type=int)
-
-    optional.add_argument('--extraSampling',
-                          help='BED file containing genomic regions for which '
-                          'extra sampling is required because they are '
-                          'underrepresented in the genome.',
-                          type=argparse.FileType('r'),
-                          metavar='BED file')
-
-    plot = parser.add_argument_group('Diagnostic plot options')
-
-    plot.add_argument('--biasPlot',
-                      metavar='FILE NAME',
-                      help='If given, a diagnostic image summarizing '
-                      'the GC-bias will be saved.')
-
-    plot.add_argument('--plotFileFormat',
-                      metavar='',
-                      help='image format type. If given, this '
-                      'option overrides the '
-                      'image format based on the plotFile ending. '
-                      'The available options are: "png", '
-                      '"eps", "pdf", "plotly" and "svg"',
-                      choices=['png', 'pdf', 'svg', 'eps', 'plotly'])
-
-    plot.add_argument('--regionSize',
-                      metavar='INT',
-                      type=int,
-                      default=300,
-                      help='To plot the reads per %%GC over a region'
-                      'the size of the region is required. By default, '
-                      'the bin size is set to 300 bases, which is close to the '
-                      'standard fragment size for Illumina machines. However, '
-                      'if the depth of sequencing is low, a larger bin size '
-                      'will be required, otherwise many bins will not '
-                      'overlap with any read (Default: %(default)s)')
+    optional = parser.add_argument_group("Optional arguments")
+    optional.add_argument(
+        "--fragmentLength",
+        "-l",
+        help="Fragment length used for the sequencing. If "
+        "paired-end reads are used, the fragment length is "
+        "computed from the bam file",
+        type=int,
+    )
+
+    optional.add_argument(
+        "--help", "-h", action="help", help="show this help message and exit"
+    )
+
+    optional.add_argument(
+        "--sampleSize",
+        default=5e7,
+        help="Number of sampling points to be considered. (Default: %(default)s)",
+        type=int,
+    )
+
+    optional.add_argument(
+        "--extraSampling",
+        help="BED file containing genomic regions for which "
+        "extra sampling is required because they are "
+        "underrepresented in the genome.",
+        type=argparse.FileType("r"),
+        metavar="BED file",
+    )
+
+    plot = parser.add_argument_group("Diagnostic plot options")
+
+    plot.add_argument(
+        "--biasPlot",
+        metavar="FILE NAME",
+        help="If given, a diagnostic image summarizing " "the GC-bias will be saved.",
+    )
+
+    plot.add_argument(
+        "--plotFileFormat",
+        metavar="",
+        help="image format type. If given, this "
+        "option overrides the "
+        "image format based on the plotFile ending. "
+        'The available options are: "png", '
+        '"eps", "pdf", "plotly" and "svg"',
+        choices=["png", "pdf", "svg", "eps", "plotly"],
+    )
+
+    plot.add_argument(
+        "--regionSize",
+        metavar="INT",
+        type=int,
+        default=300,
+        help="To plot the reads per %%GC over a region "
+        "the size of the region is required. By default, "
+        "the bin size is set to 300 bases, which is close to the "
+        "standard fragment size for Illumina machines. 
However, " + "if the depth of sequencing is low, a larger bin size " + "will be required, otherwise many bins will not " + "overlap with any read (Default: %(default)s)", + ) return parser @@ -149,13 +171,13 @@ def getPositionsToSample(chrom, start, end, stepSize): """ positions_to_sample = np.arange(start, end, stepSize) - if global_vars['filter_out']: - filter_out_tree = GTF(global_vars['filter_out']) + if global_vars["filter_out"]: + filter_out_tree = GTF(global_vars["filter_out"]) else: filter_out_tree = None - if global_vars['extra_sampling_file']: - extra_tree = GTF(global_vars['extra_sampling_file']) + if global_vars["extra_sampling_file"]: + extra_tree = GTF(global_vars["extra_sampling_file"]) else: extra_tree = None @@ -168,14 +190,17 @@ def getPositionsToSample(chrom, start, end, stepSize): if len(extra_match) > 0: for intval in extra_match: - positions_to_sample = np.append(positions_to_sample, - list(range(intval[0], intval[1], stepSize))) + positions_to_sample = np.append( + positions_to_sample, list(range(intval[0], intval[1], stepSize)) + ) # remove duplicates positions_to_sample = np.unique(np.sort(positions_to_sample)) if debug: - print("sampling increased to {} from {}".format( - len(positions_to_sample), - orig_len)) + print( + "sampling increased to {} from {}".format( + len(positions_to_sample), orig_len + ) + ) # skip regions that are filtered out if filter_out_tree: @@ -186,8 +211,10 @@ def getPositionsToSample(chrom, start, end, stepSize): if len(out_match) > 0: for intval in out_match: - positions_to_sample = \ - positions_to_sample[(positions_to_sample < intval[0]) | (positions_to_sample >= intval[1])] + positions_to_sample = positions_to_sample[ + (positions_to_sample < intval[0]) | + (positions_to_sample >= intval[1]) + ] return positions_to_sample @@ -195,21 +222,20 @@ def countReadsPerGC_wrapper(args): return countReadsPerGC_worker(*args) -def countReadsPerGC_worker(chromNameBam, - start, end, stepSize, regionSize, - chrNameBamToBit, verbose=False): +def countReadsPerGC_worker( + chromNameBam, start, end, stepSize, regionSize, chrNameBamToBit, verbose=False +): """given a genome region defined by (start, end), the GC content is quantified for regions of size regionSize that are contiguous """ chromNameBit = chrNameBamToBit[chromNameBam] - tbit = py2bit.open(global_vars['2bit']) - bam = bamHandler.openBam(global_vars['bam']) + tbit = py2bit.open(global_vars["2bit"]) + bam = bamHandler.openBam(global_vars["bam"]) c = 1 sub_reads_per_gc = [] - positions_to_sample = getPositionsToSample(chromNameBit, - start, end, stepSize) + positions_to_sample = getPositionsToSample(chromNameBit, start, end, stepSize) for index in range(len(positions_to_sample)): i = positions_to_sample[index] @@ -235,10 +261,10 @@ def tabulateGCcontent_wrapper(args): return tabulateGCcontent_worker(*args) -def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, - fragmentLength, - chrNameBamToBit, verbose=False): - r""" given genome regions, the GC content of the genome is tabulated for +def tabulateGCcontent_worker( + chromNameBam, start, end, stepSize, fragmentLength, chrNameBamToBit, verbose=False +): + r"""given genome regions, the GC content of the genome is tabulated for fragments of length 'fragmentLength' each 'stepSize' positions. >>> test = Tester() @@ -300,20 +326,20 @@ def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, # indicate the gc content. The values inside the # array are counts. 
Thus, if N_gc[10] = 3, that means # that 3 regions have a gc_content of 10. - subN_gc = np.zeros(fragmentLength['median'] + 1, dtype='int') - subF_gc = np.zeros(fragmentLength['median'] + 1, dtype='int') + subN_gc = np.zeros(fragmentLength["median"] + 1, dtype="int") + subF_gc = np.zeros(fragmentLength["median"] + 1, dtype="int") - tbit = py2bit.open(global_vars['2bit']) - bam = bamHandler.openBam(global_vars['bam']) + tbit = py2bit.open(global_vars["2bit"]) + bam = bamHandler.openBam(global_vars["bam"]) peak = 0 startTime = time.time() if verbose: - print("[{:.3f}] computing positions to " - "sample".format(time.time() - startTime)) + print( + "[{:.3f}] computing positions to " "sample".format(time.time() - startTime) + ) - positions_to_sample = getPositionsToSample(chromNameBit, - start, end, stepSize) + positions_to_sample = getPositionsToSample(chromNameBit, start, end, stepSize) read_counts = [] # Optimize IO. @@ -332,16 +358,18 @@ def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, if verbose: print("[{:.3f}] caching reads".format(time.time() - startTime)) - counts = np.bincount([r.pos - start_pos - for r in bam.fetch(chromNameBam, start_pos, - end_pos + 1) - if not r.is_reverse and not r.is_unmapped and r.pos >= start_pos], - minlength=end_pos - start_pos + 2) + counts = np.bincount( + [ + r.pos - start_pos + for r in bam.fetch(chromNameBam, start_pos, end_pos + 1) + if not r.is_reverse and not r.is_unmapped and r.pos >= start_pos + ], + minlength=end_pos - start_pos + 2, + ) read_counts = counts[positions_to_sample - min(positions_to_sample)] if verbose: - print("[{:.3f}] finish caching reads.".format( - time.time() - startTime)) + print("[{:.3f}] finish caching reads.".format(time.time() - startTime)) countTime = time.time() @@ -349,11 +377,17 @@ def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, for index in range(len(positions_to_sample)): i = positions_to_sample[index] # stop if the end of the chromosome is reached - if i + fragmentLength['median'] > tbit.chroms(chromNameBit): + if i + fragmentLength["median"] > tbit.chroms(chromNameBit): break try: - gc = getGC_content(tbit, chromNameBit, int(i), int(i + fragmentLength['median']), fraction=False) + gc = getGC_content( + tbit, + chromNameBit, + int(i), + int(i + fragmentLength["median"]), + fraction=False, + ) except Exception as detail: if verbose: print(detail) @@ -363,12 +397,17 @@ def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, # count all reads at position 'i' if len(read_counts) == 0: # case when no cache was done - num_reads = len([x.pos for x in bam.fetch(chromNameBam, i, i + 1) - if x.is_reverse is False and x.pos == i]) + num_reads = len( + [ + x.pos + for x in bam.fetch(chromNameBam, i, i + 1) + if x.is_reverse is False and x.pos == i + ] + ) else: num_reads = read_counts[index] - if num_reads >= global_vars['max_reads']: + if num_reads >= global_vars["max_reads"]: peak += 1 continue @@ -376,27 +415,58 @@ def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, if verbose: if index % 50000 == 0: endTime = time.time() - print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" % - (multiprocessing.current_process().name, - index, index / (endTime - countTime), - chromNameBit, start, end, stepSize)) + print( + "%s processing %d (%.1f per sec) @ %s:%s-%s %s" + % ( + multiprocessing.current_process().name, + index, + index / (endTime - countTime), + chromNameBit, + start, + end, + stepSize, + ) + ) c += 1 if verbose: endTime = time.time() - print("%s processing %d (%.1f per 
sec) @ %s:%s-%s %s" % - (multiprocessing.current_process().name, - index, index / (endTime - countTime), - chromNameBit, start, end, stepSize)) - print("%s total time %.1f @ %s:%s-%s %s" % (multiprocessing.current_process().name, - (endTime - startTime), chromNameBit, start, end, stepSize)) + print( + "%s processing %d (%.1f per sec) @ %s:%s-%s %s" + % ( + multiprocessing.current_process().name, + index, + index / (endTime - countTime), + chromNameBit, + start, + end, + stepSize, + ) + ) + print( + "%s total time %.1f @ %s:%s-%s %s" + % ( + multiprocessing.current_process().name, + (endTime - startTime), + chromNameBit, + start, + end, + stepSize, + ) + ) return subN_gc, subF_gc -def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize, - chromSizes, numberOfProcessors=None, verbose=False, - region=None): +def tabulateGCcontent( + fragmentLength, + chrNameBitToBam, + stepSize, + chromSizes, + numberOfProcessors=None, + verbose=False, + region=None, +): r""" Subdivides the genome or the reads into chunks to be analyzed in parallel using several processors. This codes handles the creation of @@ -418,20 +488,18 @@ def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize, [ 0. , 0. , 1. ], [ 0. , 0. , 1. ]]) """ - global global_vars - chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()]) - chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp'])) + chunkSize = int(min(2e6, 4e5 / global_vars["reads_per_bp"])) chromSizes = [(k, v) for k, v in chromSizes if k in list(chrNameBamToBit.keys())] - imap_res = mapReduce.mapReduce((stepSize, - fragmentLength, chrNameBamToBit, - verbose), - tabulateGCcontent_wrapper, - chromSizes, - genomeChunkLength=chunkSize, - numberOfProcessors=numberOfProcessors, - region=region) + imap_res = mapReduce.mapReduce( + (stepSize, fragmentLength, chrNameBamToBit, verbose), + tabulateGCcontent_wrapper, + chromSizes, + genomeChunkLength=chunkSize, + numberOfProcessors=numberOfProcessors, + region=region, + ) for subN_gc, subF_gc in imap_res: try: @@ -442,20 +510,31 @@ def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize, N_gc = subN_gc if sum(F_gc) == 0: - sys.exit("No fragments included in the sampling! Consider decreasing (or maybe increasing) the --sampleSize parameter") + sys.exit( + "No fragments included in the sampling! Consider decreasing (or maybe increasing) the --sampleSize parameter" + ) scaling = float(sum(N_gc)) / float(sum(F_gc)) - R_gc = np.array([float(F_gc[x]) / N_gc[x] * scaling - if N_gc[x] and F_gc[x] > 0 else 1 - for x in range(len(F_gc))]) + R_gc = np.array( + [ + float(F_gc[x]) / N_gc[x] * scaling if N_gc[x] and F_gc[x] > 0 else 1 + for x in range(len(F_gc)) + ] + ) data = np.transpose(np.vstack((F_gc, N_gc, R_gc))) return data -def countReadsPerGC(regionSize, chrNameBitToBam, stepSize, - chromSizes, numberOfProcessors=None, verbose=False, - region=None): +def countReadsPerGC( + regionSize, + chrNameBitToBam, + stepSize, + chromSizes, + numberOfProcessors=None, + verbose=False, + region=None, +): r""" Computes for a region of size regionSize, the GC of the region and the number of reads that overlap it. @@ -469,19 +548,17 @@ def countReadsPerGC(regionSize, chrNameBitToBam, stepSize, [134. , 0.43666667], [134. 
, 0.44 ]]) """ - global global_vars - chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()]) - chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp'])) + chunkSize = int(min(2e6, 4e5 / global_vars["reads_per_bp"])) - imap_res = mapReduce.mapReduce((stepSize, - regionSize, chrNameBamToBit, - verbose), - countReadsPerGC_wrapper, - chromSizes, - genomeChunkLength=chunkSize, - numberOfProcessors=numberOfProcessors, - region=region) + imap_res = mapReduce.mapReduce( + (stepSize, regionSize, chrNameBamToBit, verbose), + countReadsPerGC_wrapper, + chromSizes, + genomeChunkLength=chunkSize, + numberOfProcessors=numberOfProcessors, + region=region, + ) reads_per_gc = [] for sub_reads_per_gc in imap_res: @@ -507,7 +584,7 @@ def smooth(x, window_len=3): if i < half_width or i + half_width + 1 > len(x): continue else: - y[i] = np.mean(x[i - half_width:i + half_width + 1]) + y[i] = np.mean(x[i - half_width: i + half_width + 1]) # clip low values, this avoid problems with zeros return y @@ -539,14 +616,44 @@ def plotlyGCbias(file_name, frequencies, reads_per_gc, region_size): import matplotlib.cbook as cbook fig = go.Figure() - fig['layout']['xaxis1'] = dict(domain=[0.0, 1.0], anchor="y1", title="GC fraction") - fig['layout']['yaxis1'] = dict(domain=[0.55, 1.0], anchor="x1", title="Number of reads") - fig['layout']['xaxis2'] = dict(domain=[0.0, 1.0], anchor="y2", title="GC fraction", range=[0.2, 0.7]) - fig['layout']['yaxis2'] = dict(domain=[0.0, 0.45], anchor="x2", title="log2(observed/expected)") + fig["layout"]["xaxis1"] = dict(domain=[0.0, 1.0], anchor="y1", title="GC fraction") + fig["layout"]["yaxis1"] = dict( + domain=[0.55, 1.0], anchor="x1", title="Number of reads" + ) + fig["layout"]["xaxis2"] = dict( + domain=[0.0, 1.0], anchor="y2", title="GC fraction", range=[0.2, 0.7] + ) + fig["layout"]["yaxis2"] = dict( + domain=[0.0, 0.45], anchor="x2", title="log2(observed/expected)" + ) text = "reads per {} base region".format(region_size) - annos = [{'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': text, 'y': 1.0, 'x': 0.5, 'font': {'size': 16}, 'showarrow': False}] + annos = [ + { + "yanchor": "bottom", + "xref": "paper", + "xanchor": "center", + "yref": "paper", + "text": text, + "y": 1.0, + "x": 0.5, + "font": {"size": 16}, + "showarrow": False, + } + ] text = "normalized observed/expected read counts" - annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': text, 'y': 0.5, 'x': 0.5, 'font': {'size': 16}, 'showarrow': False}) + annos.append( + { + "yanchor": "bottom", + "xref": "paper", + "xanchor": "center", + "yref": "paper", + "text": text, + "y": 0.5, + "x": 0.5, + "font": {"size": 16}, + "showarrow": False, + } + ) # prepare data for boxplot reads, GC = reads_per_gc.T @@ -559,29 +666,58 @@ def plotlyGCbias(file_name, frequencies, reads_per_gc, region_size): bins = [] for b in reads_per_gc: s = cbook.boxplot_stats(b)[0] - bins.append([s['whislo'], s['q1'], s['q1'], s['med'], s['med'], s['med'], s['q3'], s['q3'], s['whishi']]) + bins.append( + [ + s["whislo"], + s["q1"], + s["q1"], + s["med"], + s["med"], + s["med"], + s["q3"], + s["q3"], + s["whishi"], + ] + ) data = [] # top plot for x, y in zip(bin_labels, bins): - trace = go.Box(x=x, y=y, xaxis='x1', yaxis='y1', boxpoints='outliers', showlegend=False, name="{}".format(x), line=dict(color='rgb(107,174,214)')) + trace = go.Box( + x=x, + y=y, + xaxis="x1", + yaxis="y1", + boxpoints="outliers", + showlegend=False, + name="{}".format(x), + 
line=dict(color="rgb(107,174,214)"), + ) data.append(trace) # bottom plot x = np.linspace(0, 1, frequencies.shape[0]) - trace = go.Scatter(x=x, y=np.log2(frequencies[:, 2]), xaxis='x2', yaxis='y2', showlegend=False, line=dict(color='rgb(107,174,214)')) + trace = go.Scatter( + x=x, + y=np.log2(frequencies[:, 2]), + xaxis="x2", + yaxis="y2", + showlegend=False, + line=dict(color="rgb(107,174,214)"), + ) data.append(trace) fig.add_traces(data) - fig['layout']['annotations'] = annos + fig["layout"]["annotations"] = annos py.plot(fig, filename=file_name, auto_open=False) def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=None): import matplotlib - matplotlib.use('Agg') - matplotlib.rcParams['pdf.fonttype'] = 42 - matplotlib.rcParams['svg.fonttype'] = 'none' + + matplotlib.use("Agg") + matplotlib.rcParams["pdf.fonttype"] = 42 + matplotlib.rcParams["svg.fonttype"] = "none" import matplotlib.pyplot as plt # prepare data for boxplot @@ -594,21 +730,20 @@ def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=N title = "reads per regions of {} bp".format(region_size) fig = plt.figure(figsize=(6, 8)) ax1 = fig.add_subplot(211, title=title) - ax2 = fig.add_subplot(212, - title='normalized observed/expected read counts') + ax2 = fig.add_subplot(212, title="normalized observed/expected read counts") # make boxplot bp = ax1.boxplot(reads_per_gc, notch=0, patch_artist=True) - plt.setp(bp['boxes'], color='black', facecolor='LightGreen') - plt.setp(bp['medians'], color='black') - plt.setp(bp['whiskers'], color='black', linestyle='dashed') - plt.setp(bp['fliers'], marker='None') + plt.setp(bp["boxes"], color="black", facecolor="LightGreen") + plt.setp(bp["medians"], color="black") + plt.setp(bp["whiskers"], color="black", linestyle="dashed") + plt.setp(bp["fliers"], marker="None") # get the whisker that spands the most - y_max = np.nanmax([x.get_data()[1][1] for x in bp['whiskers']]) + y_max = np.nanmax([x.get_data()[1][1] for x in bp["whiskers"]]) ax1.set_ylim(0 - (y_max * 0.05), y_max * 1.05) - ax1.set_ylabel('Number of reads') - ax1.set_xlabel('GC fraction') + ax1.set_ylabel("Number of reads") + ax1.set_xlabel("GC fraction") xticks = [idx for idx, x in enumerate(bin_labels) if int(x * 100) % 10 == 0] @@ -617,12 +752,12 @@ def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=N x = np.linspace(0, 1, frequencies.shape[0]) y = np.log2(frequencies[:, 2]) - ax2.plot(x, y, color='#8c96f0') - ax2.set_xlabel('GC fraction') - ax2.set_ylabel('log2ratio observed/expected') + ax2.plot(x, y, color="#8c96f0") + ax2.set_xlabel("GC fraction") + ax2.set_ylabel("log2ratio observed/expected") ax2.set_xlim(0.2, 0.7) - y_max = max(y[np.where(x >= 0.2)[0][0]:np.where(x <= 0.7)[0][-1] + 1]) - y_min = min(y[np.where(x >= 0.2)[0][0]:np.where(x <= 0.7)[0][-1] + 1]) + y_max = max(y[np.where(x >= 0.2)[0][0]: np.where(x <= 0.7)[0][-1] + 1]) + y_min = min(y[np.where(x >= 0.2)[0][0]: np.where(x <= 0.7)[0][-1] + 1]) if y_max > 0: y_max *= 1.1 else: @@ -633,7 +768,7 @@ def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=N y_min *= 0.9 ax2.set_ylim(y_min, y_max) plt.tight_layout() - plt.savefig(file_name, bbox_inches='tight', dpi=100, format=image_format) + plt.savefig(file_name, bbox_inches="tight", dpi=100, format=image_format) plt.close() @@ -648,42 +783,49 @@ def main(args=None): global global_vars global_vars = {} - global_vars['2bit'] = args.genome - global_vars['bam'] = args.bamfile - global_vars['filter_out'] = 
args.blackListFileName - global_vars['extra_sampling_file'] = extra_sampling_file + global_vars["2bit"] = args.genome + global_vars["bam"] = args.bamfile + global_vars["filter_out"] = args.blackListFileName + global_vars["extra_sampling_file"] = extra_sampling_file - tbit = py2bit.open(global_vars['2bit']) - bam, mapped, unmapped, stats = bamHandler.openBam(global_vars['bam'], returnStats=True, nThreads=args.numberOfProcessors) + tbit = py2bit.open(global_vars["2bit"]) + bam, mapped, unmapped, stats = bamHandler.openBam( + global_vars["bam"], returnStats=True, nThreads=args.numberOfProcessors + ) if args.fragmentLength: - fragment_len_dict = \ - {'median': args.fragmentLength} + fragment_len_dict = {"median": args.fragmentLength} else: - fragment_len_dict, __ = \ - get_read_and_fragment_length(args.bamfile, None, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) + fragment_len_dict, __ = get_read_and_fragment_length( + args.bamfile, + None, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + ) if not fragment_len_dict: - print("\nPlease provide the fragment length used for the " - "sample preparation.\n") + print( + "\nPlease provide the fragment length used for the " + "sample preparation.\n" + ) exit(1) - fragment_len_dict = {'median': int(fragment_len_dict['median'])} + fragment_len_dict = {"median": int(fragment_len_dict["median"])} chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references) - global_vars['genome_size'] = sum(tbit.chroms().values()) - global_vars['total_reads'] = mapped - global_vars['reads_per_bp'] = \ - float(global_vars['total_reads']) / args.effectiveGenomeSize + global_vars["genome_size"] = sum(tbit.chroms().values()) + global_vars["total_reads"] = mapped + global_vars["reads_per_bp"] = ( + float(global_vars["total_reads"]) / args.effectiveGenomeSize + ) confidence_p_value = float(1) / args.sampleSize # chromSizes: list of tuples - chromSizes = [(bam.references[i], bam.lengths[i]) - for i in range(len(bam.references))] + chromSizes = [ + (bam.references[i], bam.lengths[i]) for i in range(len(bam.references)) + ] chromSizes = [x for x in chromSizes if x[0] in tbit.chroms()] # use poisson distribution to identify peaks that should be discarted. @@ -693,107 +835,123 @@ def main(args=None): # empirically, a value of at least 4 times as big as the # reads_per_bp was found. # Similarly for the min value, I divide by 4. - global_vars['max_reads'] = poisson(4 * global_vars['reads_per_bp'] * fragment_len_dict['median']).isf(confidence_p_value) + global_vars["max_reads"] = poisson( + 4 * global_vars["reads_per_bp"] * fragment_len_dict["median"] + ).isf(confidence_p_value) # this may be of not use, unless the depth of sequencing is really high # as this value is close to 0 - global_vars['min_reads'] = poisson(0.25 * global_vars['reads_per_bp'] * fragment_len_dict['median']).ppf(confidence_p_value) + global_vars["min_reads"] = poisson( + 0.25 * global_vars["reads_per_bp"] * fragment_len_dict["median"] + ).ppf(confidence_p_value) for key in global_vars: print("{}: {}".format(key, global_vars[key])) print("computing frequencies") # the GC of the genome is sampled each stepSize bp. 
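# --- Illustrative sketch, not part of the patch (values assumed from the
# Tester defaults): how the Poisson cutoffs computed in main() above behave.
# Positions carrying more than max_reads reads are treated as peaks and
# skipped during sampling; min_reads is usually ~0 unless coverage is deep.
from scipy.stats import poisson

reads_per_bp = 0.3              # assumed coverage
fragment_len = 200              # assumed median fragment length
confidence_p_value = 1.0 / 5e7  # 1 / default --sampleSize

max_reads = poisson(4 * reads_per_bp * fragment_len).isf(confidence_p_value)
min_reads = poisson(0.25 * reads_per_bp * fragment_len).ppf(confidence_p_value)
print(max_reads, min_reads)  # upper cutoff lands well above the mean of 240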
- stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1) + stepSize = max(int(global_vars["genome_size"] / args.sampleSize), 1) print("stepSize: {}".format(stepSize)) - data = tabulateGCcontent(fragment_len_dict, - chrNameBitToBam, stepSize, - chromSizes, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose, - region=args.region) + data = tabulateGCcontent( + fragment_len_dict, + chrNameBitToBam, + stepSize, + chromSizes, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + region=args.region, + ) np.savetxt(args.GCbiasFrequenciesFile.name, data) if args.biasPlot: - reads_per_gc = countReadsPerGC(args.regionSize, - chrNameBitToBam, stepSize * 10, - chromSizes, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose, - region=args.region) + reads_per_gc = countReadsPerGC( + args.regionSize, + chrNameBitToBam, + stepSize * 10, + chromSizes, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + region=args.region, + ) if args.plotFileFormat == "plotly": plotlyGCbias(args.biasPlot, data, reads_per_gc, args.regionSize) else: - plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat) + plotGCbias( + args.biasPlot, + data, + reads_per_gc, + args.regionSize, + image_format=args.plotFileFormat, + ) -class Tester(): +class Tester: def __init__(self): import os + self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/" self.tbitFile = self.root + "sequence.2bit" self.bamFile = self.root + "test.bam" self.mappability = self.root + "mappability.bw" - self.chrNameBam = '2L' - self.chrNameBit = 'chr2L' - bam, mapped, unmapped, stats = bamHandler.openBam(self.bamFile, returnStats=True) + self.chrNameBam = "2L" + self.chrNameBit = "chr2L" + bam, mapped, unmapped, stats = bamHandler.openBam( + self.bamFile, returnStats=True + ) tbit = py2bit.open(self.tbitFile) global debug debug = 0 global global_vars - global_vars = {'2bit': self.tbitFile, - 'bam': self.bamFile, - 'filter_out': None, - 'mappability': self.mappability, - 'extra_sampling_file': None, - 'max_reads': 5, - 'min_reads': 0, - 'min_reads': 0, - 'reads_per_bp': 0.3, - 'total_reads': mapped, - 'genome_size': sum(tbit.chroms().values()) - } + global_vars = { + "2bit": self.tbitFile, + "bam": self.bamFile, + "filter_out": None, + "mappability": self.mappability, + "extra_sampling_file": None, + "max_reads": 5, + "min_reads": 0, + "min_reads": 0, + "reads_per_bp": 0.3, + "total_reads": mapped, + "genome_size": sum(tbit.chroms().values()), + } def testTabulateGCcontentWorker(self): stepSize = 2 - fragmentLength = {'min': 1, 'median': 3, 'max': 5} + fragmentLength = {"min": 1, "median": 3, "max": 5} start = 0 end = 20 - chrNameBam2bit = {'2L': 'chr2L'} - return (self.chrNameBam, - start, end, stepSize, fragmentLength, chrNameBam2bit) + chrNameBam2bit = {"2L": "chr2L"} + return (self.chrNameBam, start, end, stepSize, fragmentLength, chrNameBam2bit) def set_filter_out_file(self): - global global_vars - global_vars['filter_out'] = self.root + "filter_out.bed" + global_vars["filter_out"] = self.root + "filter_out.bed" def unset_filter_out_file(self): - global global_vars - global_vars['filter_out'] = None + global_vars["filter_out"] = None def set_extra_sampling_file(self): - global global_vars - global_vars['extra_sampling_file'] = self.root + "extra_sampling.bed" + global_vars["extra_sampling_file"] = self.root + "extra_sampling.bed" def testTabulateGCcontent(self): - fragmentLength = {'median': 10} - chrNameBitToBam 
= {'chr2L': '2L'} + fragmentLength = {"median": 10} + chrNameBitToBam = {"chr2L": "2L"} stepSize = 1 - bam = bamHandler.openBam(global_vars['bam']) - chromSizes = [(bam.references[i], bam.lengths[i]) - for i in range(len(bam.references))] - return (fragmentLength, - chrNameBitToBam, stepSize, chromSizes, 1) + bam = bamHandler.openBam(global_vars["bam"]) + chromSizes = [ + (bam.references[i], bam.lengths[i]) for i in range(len(bam.references)) + ] + return (fragmentLength, chrNameBitToBam, stepSize, chromSizes, 1) def testCountReadsPerGC(self): regionSize = 300 - chrNameBitToBam = {'chr2L': '2L'} + chrNameBitToBam = {"chr2L": "2L"} stepSize = 1 - bam = bamHandler.openBam(global_vars['bam']) - chromSizes = [(bam.references[i], bam.lengths[i]) - for i in range(len(bam.references))] - return (regionSize, - chrNameBitToBam, stepSize, chromSizes, 1) + bam = bamHandler.openBam(global_vars["bam"]) + chromSizes = [ + (bam.references[i], bam.lengths[i]) for i in range(len(bam.references)) + ] + return (regionSize, chrNameBitToBam, stepSize, chromSizes, 1) if __name__ == "__main__": diff --git a/deeptools/computeMatrixOperations.py b/deeptools/computeMatrixOperations.py index 0224f00a39..8e03070521 100755 --- a/deeptools/computeMatrixOperations.py +++ b/deeptools/computeMatrixOperations.py @@ -7,6 +7,7 @@ import os import csv from importlib.metadata import version +import warnings def parse_arguments(): @@ -427,7 +428,7 @@ def filterHeatmapValues(hm, minVal, maxVal): minVal = -np.inf if maxVal is None: maxVal = np.inf - np.warnings.filterwarnings('ignore') + warnings.filterwarnings('ignore') for i, (x, y) in enumerate(zip(np.nanmin(hm.matrix.matrix, axis=1), np.nanmax(hm.matrix.matrix, axis=1))): # x/y will be nan iff a row is entirely nan. Don't filter. if np.isnan(x) or (x >= minVal and y <= maxVal): diff --git a/deeptools/correctGCBias.py b/deeptools/correctGCBias.py index 1154b93688..048bf4a64c 100755 --- a/deeptools/correctGCBias.py +++ b/deeptools/correctGCBias.py @@ -20,7 +20,7 @@ from deeptools import utilities from deeptools.bamHandler import openBam -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parse_arguments(args=None): @@ -29,21 +29,22 @@ def parse_arguments(args=None): parser = argparse.ArgumentParser( parents=[requiredArgs, parentParser], formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='This tool corrects the GC-bias using the' - ' method proposed by [Benjamini & Speed (2012). ' - 'Nucleic Acids Research, 40(10)]. It will remove reads' - ' from regions with too high coverage compared to the' - ' expected values (typically GC-rich regions) and will' - ' add reads to regions where too few reads are seen ' - '(typically AT-rich regions). ' - 'The tool ``computeGCBias`` needs to be run first to generate the ' - 'frequency table needed here.', - usage='correctGCBias ' - '-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit ' - '--GCbiasFrequenciesFile freq.txt -o gc_corrected.bam\n' - 'help: correctGCBias -h / correctGCBias --help', - conflict_handler='resolve', - add_help=False) + description="This tool corrects the GC-bias using the" + " method proposed by [Benjamini & Speed (2012). " + "Nucleic Acids Research, 40(10)]. It will remove reads" + " from regions with too high coverage compared to the" + " expected values (typically GC-rich regions) and will" + " add reads to regions where too few reads are seen " + "(typically AT-rich regions). 
" + "The tool ``computeGCBias`` needs to be run first to generate the " + "frequency table needed here.", + usage="correctGCBias " + "-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit " + "--GCbiasFrequenciesFile freq.txt -o gc_corrected.bam\n" + "help: correctGCBias -h / correctGCBias --help", + conflict_handler="resolve", + add_help=False, + ) return parser @@ -56,59 +57,74 @@ def process_args(args=None): def getRequiredArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") # define the arguments - required.add_argument('--bamfile', '-b', - metavar='BAM file', - help='Sorted BAM file to correct.', - required=True) - required.add_argument('--effectiveGenomeSize', - help='The effective genome size is the portion ' - 'of the genome that is mappable. Large fractions of ' - 'the genome are stretches of NNNN that should be ' - 'discarded. Also, if repetitive regions were not ' - 'included in the mapping of reads, the effective ' - 'genome size needs to be adjusted accordingly. ' - 'A table of values is available here: ' - 'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .', - default=None, - type=int, - required=True) - - required.add_argument('--genome', '-g', - help='Genome in two bit format. Most genomes can be ' - 'found here: http://hgdownload.cse.ucsc.edu/gbdb/ ' - 'Search for the .2bit ending. Otherwise, fasta ' - 'files can be converted to 2bit using faToTwoBit ' - 'available here: ' - 'http://hgdownload.cse.ucsc.edu/admin/exe/', - metavar='two bit file', - required=True) - - required.add_argument('--GCbiasFrequenciesFile', '-freq', - help='Indicate the output file from ' - 'computeGCBias containing ' - 'the observed and expected read frequencies per GC-' - 'content.', - type=argparse.FileType('r'), - metavar='FILE', - required=True) - - output = parser.add_argument_group('Output options') - output.add_argument('--correctedFile', '-o', - help='Name of the corrected file. The ending will ' - 'be used to decide the output file format. The options ' - 'are ".bam", ".bw" for a bigWig file, ".bg" for a ' - 'bedGraph file.', - metavar='FILE', - type=argparse.FileType('w'), - required=True) + required.add_argument( + "--bamfile", + "-b", + metavar="BAM file", + help="Sorted BAM file to correct.", + required=True, + ) + required.add_argument( + "--effectiveGenomeSize", + help="The effective genome size is the portion " + "of the genome that is mappable. Large fractions of " + "the genome are stretches of NNNN that should be " + "discarded. Also, if repetitive regions were not " + "included in the mapping of reads, the effective " + "genome size needs to be adjusted accordingly. " + "A table of values is available here: " + "http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .", + default=None, + type=int, + required=True, + ) + + required.add_argument( + "--genome", + "-g", + help="Genome in two bit format. Most genomes can be " + "found here: http://hgdownload.cse.ucsc.edu/gbdb/ " + "Search for the .2bit ending. 
Otherwise, fasta " + "files can be converted to 2bit using faToTwoBit " + "available here: " + "http://hgdownload.cse.ucsc.edu/admin/exe/", + metavar="two bit file", + required=True, + ) + + required.add_argument( + "--GCbiasFrequenciesFile", + "-freq", + help="Indicate the output file from " + "computeGCBias containing " + "the observed and expected read frequencies per GC-" + "content.", + type=argparse.FileType("r"), + metavar="FILE", + required=True, + ) + + output = parser.add_argument_group("Output options") + output.add_argument( + "--correctedFile", + "-o", + help="Name of the corrected file. The ending will " + "be used to decide the output file format. The options " + 'are ".bam", ".bw" for a bigWig file, ".bg" for a ' + "bedGraph file.", + metavar="FILE", + type=argparse.FileType("w"), + required=True, + ) # define the optional arguments - optional = parser.add_argument_group('Optional arguments') - optional.add_argument("--help", "-h", action="help", - help="show this help message and exit") + optional = parser.add_argument_group("Optional arguments") + optional.add_argument( + "--help", "-h", action="help", help="show this help message and exit" + ) return parser @@ -174,15 +190,14 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step): ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n'] >>> os.remove(tempFile) """ - global R_gc fragmentLength = len(R_gc) - 1 cvg_corr = np.zeros(end - start) i = 0 - tbit = py2bit.open(global_vars['2bit']) - bam = openBam(global_vars['bam']) + tbit = py2bit.open(global_vars["2bit"]) + bam = openBam(global_vars["bam"]) read_repetitions = 0 removed_duplicated_reads = 0 startTime = time.time() @@ -191,8 +206,7 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step): # r.flag & 4 == 0 is to skip unmapped # reads that nevertheless are asigned # to a genomic position - reads = [r for r in bam.fetch(chrNameBam, start, end) - if r.flag & 4 == 0] + reads = [r for r in bam.fetch(chrNameBam, start, end) if r.flag & 4 == 0] bam.close() @@ -203,8 +217,7 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step): r_index += 1 try: # calculate GC content of read fragment - gc = getReadGCcontent(tbit, read, fragmentLength, - chrNameBit) + gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit) except Exception as detail: print(detail) """ this exception happens when the end of a @@ -214,18 +227,23 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step): continue # is this read in the same orientation and position as the previous? 
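# --- Illustrative sketch with toy values, not part of the patch: the weighting
# writeCorrected_worker applies. Each fragment contributes 1 / R_gc[gc] to the
# corrected coverage, so over-represented GC fractions (R_gc > 1) are
# down-weighted and under-represented ones are boosted.
import numpy as np

R_gc_toy = np.array([0.5, 1.0, 2.0])  # toy observed/expected ratios per GC count
cvg_corr_toy = np.zeros(10)
for frag_start, gc in [(2, 2), (3, 2), (5, 0)]:  # (fragment start, GC count)
    cvg_corr_toy[frag_start:frag_start + 3] += 1.0 / R_gc_toy[gc]
# GC-rich fragments (R_gc = 2) add only 0.5 each; AT-rich ones (R_gc = 0.5) add 2.0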
- if r_index > 0 and read.pos == reads[r_index - 1].pos and \ - read.is_reverse == reads[r_index - 1].is_reverse \ - and read.pnext == reads[r_index - 1].pnext: + if ( + r_index > 0 and + read.pos == reads[r_index - 1].pos and + read.is_reverse == reads[r_index - 1].is_reverse and + read.pnext == reads[r_index - 1].pnext + ): read_repetitions += 1 - if read_repetitions >= global_vars['max_dup_gc'][gc]: + if read_repetitions >= global_vars["max_dup_gc"][gc]: removed_duplicated_reads += 1 continue else: read_repetitions = 0 try: - fragmentStart, fragmentEnd = getFragmentFromRead(read, fragmentLength, extendPairedEnds=True) + fragmentStart, fragmentEnd = getFragmentFromRead( + read, fragmentLength, extendPairedEnds=True + ) vectorStart = max(fragmentStart - start, 0) vectorEnd = min(fragmentEnd - start, end - start) except TypeError: @@ -239,25 +257,33 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step): try: if debug: endTime = time.time() - print("{}, processing {} ({:.1f} per sec) " - "reads @ {}:{}-{}".format(multiprocessing.current_process().name, - i, i / (endTime - startTime), - chrNameBit, start, end)) + print( + "{}, processing {} ({:.1f} per sec) " + "reads @ {}:{}-{}".format( + multiprocessing.current_process().name, + i, + i / (endTime - startTime), + chrNameBit, + start, + end, + ) + ) except NameError: pass if i == 0: return None - _file = open(utilities.getTempFileName(suffix='.bg'), 'w') + _file = open(utilities.getTempFileName(suffix=".bg"), "w") # save in bedgraph format for bin in range(0, len(cvg_corr), step): - value = np.mean(cvg_corr[bin:min(bin + step, end)]) + value = np.mean(cvg_corr[bin: min(bin + step, end)]) if value > 0: writeStart = start + bin writeEnd = min(start + bin + step, end) - _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart, - writeEnd, value)) + _file.write( + "%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart, writeEnd, value) + ) tempFileName = _file.name _file.close() @@ -288,10 +314,15 @@ def writeCorrectedSam_wrapper(args): return writeCorrectedSam_worker(*args) -def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end, - step=None, - tag_but_not_change_number=False, - verbose=True): +def writeCorrectedSam_worker( + chrNameBam, + chrNameBit, + start, + end, + step=None, + tag_but_not_change_number=False, + verbose=True, +): r""" Writes a BAM file, deleting and adding some reads in order to compensate for the GC bias. 
**This is a stochastic method.** @@ -326,19 +357,18 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end, >>> res = os.remove(tempFile) >>> res = os.remove(tempFile+".bai") """ - global R_gc fragmentLength = len(R_gc) - 1 if verbose: print("Sam for %s %s %s " % (chrNameBit, start, end)) i = 0 - tbit = py2bit.open(global_vars['2bit']) + tbit = py2bit.open(global_vars["2bit"]) - bam = openBam(global_vars['bam']) - tempFileName = utilities.getTempFileName(suffix='.bam') + bam = openBam(global_vars["bam"]) + tempFileName = utilities.getTempFileName(suffix=".bam") - outfile = pysam.Samfile(tempFileName, 'wb', template=bam) + outfile = pysam.Samfile(tempFileName, "wb", template=bam) startTime = time.time() matePairs = {} read_repetitions = 0 @@ -347,8 +377,11 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end, # cache data # r.flag & 4 == 0 is to filter unmapped reads that # have a genomic position - reads = [r for r in bam.fetch(chrNameBam, start, end) - if r.pos > start and r.flag & 4 == 0] + reads = [ + r + for r in bam.fetch(chrNameBam, start, end) + if r.pos > start and r.flag & 4 == 0 + ] r_index = -1 for read in reads: @@ -361,26 +394,29 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end, # check if a mate has already been procesed # to apply the same correction try: - copies = matePairs[read.qname]['copies'] - gc = matePairs[read.qname]['gc'] + copies = matePairs[read.qname]["copies"] + gc = matePairs[read.qname]["gc"] del matePairs[read.qname] except: # this exception happens when a mate is # not present. This could # happen because of removal of the mate # by some filtering - gc = getReadGCcontent(tbit, read, fragmentLength, - chrNameBit) + gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit) if gc: copies = numCopiesOfRead(float(1) / R_gc[gc]) else: copies = 1 # is this read in the same orientation and position as the previous? 
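# --- Hypothetical sketch of the stochastic rounding behind numCopiesOfRead
# (the helper itself lies outside the hunks shown): a weight of 1.7 always
# yields one copy, plus a second copy 70% of the time. This coin flip on the
# fractional part is what makes the BAM-level correction stochastic.
import random

def num_copies(weight):
    copies = int(weight)                   # deterministic integer part
    if random.random() < weight - copies:  # fractional part is a coin flip
        copies += 1
    return copies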
- if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \ - and read.is_reverse == reads[r_index - 1].is_reverse \ - and read.pnext == reads[r_index - 1].pnext: + if ( + gc and + r_index > 0 and + read.pos == reads[r_index - 1].pos and + read.is_reverse == reads[r_index - 1].is_reverse and + read.pnext == reads[r_index - 1].pnext + ): read_repetitions += 1 - if read_repetitions >= global_vars['max_dup_gc'][gc]: + if read_repetitions >= global_vars["max_dup_gc"][gc]: copies = 0 # in other words do not take into account this read removed_duplicated_reads += 1 else: @@ -405,23 +441,23 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end, replace_tags = True if gc: - GC = int(100 * np.round(float(gc) / fragmentLength, - decimals=2)) - readTag.append( - ('YC', float(round(float(1) / R_gc[gc], 2)), "f")) - readTag.append(('YN', copies, "i")) + GC = int(100 * np.round(float(gc) / fragmentLength, decimals=2)) + readTag.append(("YC", float(round(float(1) / R_gc[gc], 2)), "f")) + readTag.append(("YN", copies, "i")) else: GC = -1 - readTag.append(('YG', GC, "i")) + readTag.append(("YG", GC, "i")) if replace_tags: read.set_tags(readTag) - if read.is_paired and read.is_proper_pair \ - and not read.mate_is_unmapped \ - and not read.is_reverse: - matePairs[readName] = {'copies': copies, - 'gc': gc} + if ( + read.is_paired and + read.is_proper_pair and + not read.mate_is_unmapped and + not read.is_reverse + ): + matePairs[readName] = {"copies": copies, "gc": gc} """ outfile.write(read) @@ -440,23 +476,40 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end, if verbose: if i % 500000 == 0 and i > 0: endTime = time.time() - print("{}, processing {} ({:.1f} per sec) reads " - "@ {}:{}-{}".format(multiprocessing.current_process().name, - i, i / (endTime - startTime), - chrNameBit, start, end)) + print( + "{}, processing {} ({:.1f} per sec) reads " + "@ {}:{}-{}".format( + multiprocessing.current_process().name, + i, + i / (endTime - startTime), + chrNameBit, + start, + end, + ) + ) i += 1 outfile.close() if verbose: endTime = time.time() - print("{}, processing {} ({:.1f} per sec) reads " - "@ {}:{}-{}".format(multiprocessing.current_process().name, - i, i / (endTime - startTime), - chrNameBit, start, end)) - percentage = float(removed_duplicated_reads) * 100 / len(reads) \ - if len(reads) > 0 else 0 - print("duplicated reads removed %d of %d (%.2f) " % - (removed_duplicated_reads, len(reads), percentage)) + print( + "{}, processing {} ({:.1f} per sec) reads " + "@ {}:{}-{}".format( + multiprocessing.current_process().name, + i, + i / (endTime - startTime), + chrNameBit, + start, + end, + ) + ) + percentage = ( + float(removed_duplicated_reads) * 100 / len(reads) if len(reads) > 0 else 0 + ) + print( + "duplicated reads removed %d of %d (%.2f) " + % (removed_duplicated_reads, len(reads), percentage) + ) return tempFileName @@ -542,10 +595,10 @@ def run_shell_command(command): subprocess.check_call(command, shell=True) except subprocess.CalledProcessError as error: - sys.stderr.write('Error{}\n'.format(error)) + sys.stderr.write("Error{}\n".format(error)) exit(1) except Exception as error: - sys.stderr.write('Error: {}\n'.format(error)) + sys.stderr.write("Error: {}\n".format(error)) exit(1) @@ -561,42 +614,47 @@ def main(args=None): global global_vars global_vars = {} - global_vars['2bit'] = args.genome - global_vars['bam'] = args.bamfile + global_vars["2bit"] = args.genome + global_vars["bam"] = args.bamfile # compute the probability to find more than one read (a redundant 
read) # at a certain position based on the gc of the read fragment # the binomial function is used for that - max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x]) - if F_gc[x] > 0 and N_gc[x] > 0 else 1 - for x in range(len(F_gc))] + max_dup_gc = [ + binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x]) if F_gc[x] > 0 and N_gc[x] > 0 else 1 + for x in range(len(F_gc)) + ] - global_vars['max_dup_gc'] = max_dup_gc + global_vars["max_dup_gc"] = max_dup_gc - tbit = py2bit.open(global_vars['2bit']) - bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors) + tbit = py2bit.open(global_vars["2bit"]) + bam, mapped, unmapped, stats = openBam( + args.bamfile, returnStats=True, nThreads=args.numberOfProcessors + ) - global_vars['genome_size'] = sum(tbit.chroms().values()) - global_vars['total_reads'] = mapped - global_vars['reads_per_bp'] = \ - float(global_vars['total_reads']) / args.effectiveGenomeSize + global_vars["genome_size"] = sum(tbit.chroms().values()) + global_vars["total_reads"] = mapped + global_vars["reads_per_bp"] = ( + float(global_vars["total_reads"]) / args.effectiveGenomeSize + ) # apply correction print("applying correction") # divide the genome in fragments containing about 4e5 reads. # This amount of reads takes about 20 seconds # to process per core (48 cores, 256 Gb memory) - chunkSize = int(4e5 / global_vars['reads_per_bp']) + chunkSize = int(4e5 / global_vars["reads_per_bp"]) # chromSizes: list of tuples - chromSizes = [(bam.references[i], bam.lengths[i]) - for i in range(len(bam.references))] + chromSizes = [ + (bam.references[i], bam.lengths[i]) for i in range(len(bam.references)) + ] regionStart = 0 if args.region: - chromSizes, regionStart, regionEnd, chunkSize = \ - mapReduce.getUserRegion(chromSizes, args.region, - max_chunk_size=chunkSize) + chromSizes, regionStart, regionEnd, chunkSize = mapReduce.getUserRegion( + chromSizes, args.region, max_chunk_size=chunkSize + ) print("genome partition size for multiprocessing: {}".format(chunkSize)) print("using region {}".format(args.region)) @@ -617,20 +675,21 @@ def main(args=None): print("Reads in this chromosome will be skipped") continue length = min(size, i + chunkSize) - mp_args.append((chrom, chrNameBamToBit[chrom], i, length, - bedGraphStep)) + mp_args.append((chrom, chrNameBamToBit[chrom], i, length, bedGraphStep)) c += 1 pool = multiprocessing.Pool(args.numberOfProcessors) - if args.correctedFile.name.endswith('bam'): + if args.correctedFile.name.endswith("bam"): if len(mp_args) > 1 and args.numberOfProcessors > 1: - print(("using {} processors for {} " - "number of tasks".format(args.numberOfProcessors, - len(mp_args)))) - - res = pool.map_async( - writeCorrectedSam_wrapper, mp_args).get(9999999) + print( + ( + "using {} processors for {} " + "number of tasks".format(args.numberOfProcessors, len(mp_args)) + ) + ) + + res = pool.map_async(writeCorrectedSam_wrapper, mp_args).get(9999999) else: res = list(map(writeCorrectedSam_wrapper, mp_args)) @@ -655,8 +714,7 @@ def main(args=None): for tempFileName in res: os.remove(tempFileName) - if args.correctedFile.name.endswith('bg') or \ - args.correctedFile.name.endswith('bw'): + if args.correctedFile.name.endswith("bg") or args.correctedFile.name.endswith("bw"): if len(mp_args) > 1 and args.numberOfProcessors > 1: @@ -666,11 +724,11 @@ def main(args=None): oname = args.correctedFile.name args.correctedFile.close() - if oname.endswith('bg'): - f = open(oname, 'wb') + if oname.endswith("bg"): + f = open(oname, "wb") for tempFileName in 
res: if tempFileName: - shutil.copyfileobj(open(tempFileName, 'rb'), f) + shutil.copyfileobj(open(tempFileName, "rb"), f) os.remove(tempFileName) f.close() else: @@ -678,68 +736,65 @@ def main(args=None): writeBedGraph.bedGraphToBigWig(chromSizes, res, oname) -class Tester(): +class Tester: def __init__(self): import os + self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/" self.tbitFile = self.root + "sequence.2bit" self.bamFile = self.root + "test.bam" - self.chrNameBam = '2L' - self.chrNameBit = 'chr2L' + self.chrNameBam = "2L" + self.chrNameBit = "chr2L" bam, mapped, unmapped, stats = openBam(self.bamFile, returnStats=True) tbit = py2bit.open(self.tbitFile) global debug debug = 0 global global_vars - global_vars = {'2bit': self.tbitFile, - 'bam': self.bamFile, - 'filter_out': None, - 'extra_sampling_file': None, - 'max_reads': 5, - 'min_reads': 0, - 'min_reads': 0, - 'reads_per_bp': 0.3, - 'total_reads': mapped, - 'genome_size': sum(tbit.chroms().values())} + global_vars = { + "2bit": self.tbitFile, + "bam": self.bamFile, + "filter_out": None, + "extra_sampling_file": None, + "max_reads": 5, + "min_reads": 0, + "min_reads": 0, + "reads_per_bp": 0.3, + "total_reads": mapped, + "genome_size": sum(tbit.chroms().values()), + } def testWriteCorrectedChunk(self): - """ prepare arguments for test - """ - global R_gc, R_gc_min, R_gc_max + """prepare arguments for test""" + global R_gc R_gc = np.loadtxt(self.root + "R_gc_paired.txt") - global_vars['max_dup_gc'] = np.ones(301) + global_vars["max_dup_gc"] = np.ones(301) start = 200 end = 300 bedGraphStep = 25 - return (self.chrNameBam, - self.chrNameBit, start, end, bedGraphStep) + return (self.chrNameBam, self.chrNameBit, start, end, bedGraphStep) def testWriteCorrectedSam(self): - """ prepare arguments for test - """ - global R_gc, R_gc_min, R_gc_max + """prepare arguments for test""" + global R_gc R_gc = np.loadtxt(self.root + "R_gc_paired.txt") - global_vars['max_dup_gc'] = np.ones(301) + global_vars["max_dup_gc"] = np.ones(301) start = 200 end = 250 - return (self.chrNameBam, - self.chrNameBit, start, end) + return (self.chrNameBam, self.chrNameBit, start, end) def testWriteCorrectedSam_paired(self): - """ prepare arguments for test. 
- """ - global R_gc, R_gc_min, R_gc_max + """prepare arguments for test.""" + global R_gc R_gc = np.loadtxt(self.root + "R_gc_paired.txt") start = 0 end = 500 - global global_vars - global_vars['bam'] = self.root + "paired.bam" - return 'chr2L', 'chr2L', start, end + global_vars["bam"] = self.root + "paired.bam" + return "chr2L", "chr2L", start, end if __name__ == "__main__": diff --git a/deeptools/heatmapper.py b/deeptools/heatmapper.py index f86b85e234..941e91f17f 100644 --- a/deeptools/heatmapper.py +++ b/deeptools/heatmapper.py @@ -1308,9 +1308,9 @@ def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_sam _clustered_matrix = [] cluster_number = 1 for cluster in cluster_order: - self.group_labels.append("cluster_{}".format(cluster_number)) - cluster_number += 1 cluster_ids = _cluster_ids_list[cluster] + self.group_labels.append("cluster_{}_n{}".format(cluster_number, len(cluster_ids))) + cluster_number += 1 self.group_boundaries.append(self.group_boundaries[-1] + len(cluster_ids)) _clustered_matrix.append(self.matrix[cluster_ids, :]) diff --git a/deeptools/heatmapper_utilities.py b/deeptools/heatmapper_utilities.py index e63dfb0226..13d1bb47ed 100644 --- a/deeptools/heatmapper_utilities.py +++ b/deeptools/heatmapper_utilities.py @@ -1,16 +1,21 @@ import numpy as np import matplotlib -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" from deeptools import cm # noqa: F401 import matplotlib.colors as pltcolors import plotly.graph_objs as go -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") + +debug = 0 +if debug: + from ipdb import set_trace -def plot_single(ax, ma, average_type, color, label, plot_type='lines'): +def plot_single(ax, ma, average_type, color, label, plot_type="lines"): """ Adds a line to the plot in the given ax using the specified method @@ -18,7 +23,7 @@ def plot_single(ax, ma, average_type, color, label, plot_type='lines'): ---------- ax : matplotlib axis matplotlib axis - ma : numpy array + ma : numpy array or list of numpy array(for plot with --repgrplist, take average between replicates ) numpy array The data on this matrix is summarized according to the `average_type` argument. average_type : str @@ -32,7 +37,9 @@ def plot_single(ax, ma, average_type, color, label, plot_type='lines'): type of plot. Either 'se' for standard error, 'std' for standard deviation, 'overlapped_lines' to plot each line of the matrix, fill to plot the area between the x axis and the value or any other string to - just plot the average line. + just plot the average line. When assign samples to replicates group such as + '--repgrplist WT WT KO KO' : 'std' would be the standard deviation between replicates groups. + 'se' for standard error between replicates groups. 

     Returns
     -------
@@ -63,17 +70,27 @@
     """
-    summary = np.ma.__getattribute__(average_type)(ma, axis=0)
+    if isinstance(ma, list):
+        # --repgrplist mode: summarize each replicate separately, then
+        # plot the mean of the per-replicate summaries
+        summary_list = []
+        for ma_each in ma:
+            tmp = np.ma.__getattribute__(average_type)(ma_each, axis=0)
+            summary_list.append(tmp)
+        ma = np.array(summary_list)
+        summary = np.ma.average(ma, axis=0)
+    else:
+        summary = np.ma.__getattribute__(average_type)(ma, axis=0)
     # only plot the average profiles without error regions
     x = np.arange(len(summary))
     if isinstance(color, np.ndarray):
         color = pltcolors.to_hex(color, keep_alpha=True)
     ax.plot(x, summary, color=color, label=label, alpha=0.9)
-    if plot_type == 'fill':
-        ax.fill_between(x, summary, facecolor=color, alpha=0.6, edgecolor='none')
+    if plot_type == "fill":
+        ax.fill_between(x, summary, facecolor=color, alpha=0.6, edgecolor="none")

-    if plot_type in ['se', 'std']:
-        if plot_type == 'se':  # standard error
+    if plot_type in ["se", "std"]:
+        if plot_type == "se":  # standard error
             std = np.std(ma, axis=0) / np.sqrt(ma.shape[0])
         else:
             std = np.std(ma, axis=0)
@@ -83,39 +100,49 @@ def plot_single(ax, ma, average_type, color, label, plot_type='lines'):
         # between the mean (or median etc.) and the std or se
         f_color = pltcolors.colorConverter.to_rgba(color, alpha)

-        ax.fill_between(x, summary, summary + std, facecolor=f_color, edgecolor='none')
-        ax.fill_between(x, summary, summary - std, facecolor=f_color, edgecolor='none')
+        ax.fill_between(x, summary, summary + std, facecolor=f_color, edgecolor="none")
+        ax.fill_between(x, summary, summary - std, facecolor=f_color, edgecolor="none")

     ax.set_xlim(0, max(x))

     return ax


-def plotly_single(ma, average_type, color, label, plot_type='line'):
+def plotly_single(ma, average_type, color, label, plot_type="line"):
     """A plotly version of plot_single. Returns a list of traces"""
     summary = list(np.ma.__getattribute__(average_type)(ma, axis=0))
     x = list(np.arange(len(summary)))
     if isinstance(color, str):
         color = list(matplotlib.colors.to_rgb(color))
-    traces = [go.Scatter(x=x, y=summary, name=label, line={'color': "rgba({},{},{},0.9)".format(color[0], color[1], color[2])}, showlegend=False)]
-    if plot_type == 'fill':
-        traces[0].update(fill='tozeroy', fillcolor=color)
-
-    if plot_type in ['se', 'std']:
-        if plot_type == 'se':  # standard error
+    traces = [
+        go.Scatter(
+            x=x,
+            y=summary,
+            name=label,
+            line={"color": "rgba({},{},{},0.9)".format(color[0], color[1], color[2])},
+            showlegend=False,
+        )
+    ]
+    if plot_type == "fill":
+        traces[0].update(fill="tozeroy", fillcolor=color)
+
+    if plot_type in ["se", "std"]:
+        if plot_type == "se":  # standard error
             std = np.std(ma, axis=0) / np.sqrt(ma.shape[0])
         else:
             std = np.std(ma, axis=0)

         x_rev = x[::-1]
         lower = summary - std
-        trace = go.Scatter(x=x + x_rev,
-                           y=np.concatenate([summary + std, lower[::-1]]),
-                           fill='tozerox',
-                           fillcolor="rgba({},{},{},0.2)".format(color[0], color[1], color[2]),
-                           line=go.Line(color='transparent'),
-                           showlegend=False,
-                           name=label)
+        trace = go.Scatter(
+            x=x + x_rev,
+            y=np.concatenate([summary + std, lower[::-1]]),
+            fill="tozerox",
+            fillcolor="rgba({},{},{},0.2)".format(color[0], color[1], color[2]),
+            line=go.Line(color="transparent"),
+            showlegend=False,
+            name=label,
+        )
         traces.append(trace)

     return traces
@@ -133,42 +160,44 @@ def getProfileTicks(hm, referencePointLabel, startLabel, endLabel, idx):
     As of matplotlib 3.1 there is no longer padding added to all ticks.
     Reference point ticks will be adjusted by width/2 or width for spacing
     and the last half of scaled ticks will be shifted by 1 bin so the ticks
     are at the beginning of bins.
     """
-    w = hm.parameters['bin size']
-    b = hm.parameters['upstream']
-    a = hm.parameters['downstream']
+    w = hm.parameters["bin size"]
+    b = hm.parameters["upstream"]
+    a = hm.parameters["downstream"]
     if idx is not None:
         w = w[idx]
         b = b[idx]
         a = a[idx]
     try:
-        c = hm.parameters['unscaled 5 prime']
+        c = hm.parameters["unscaled 5 prime"]
         if idx is not None:
             c = c[idx]
     except:
         c = 0
     try:
-        d = hm.parameters['unscaled 3 prime']
+        d = hm.parameters["unscaled 3 prime"]
         if idx is not None:
             d = d[idx]
     except:
         d = 0
-    m = hm.parameters['body']
+    m = hm.parameters["body"]
     if idx is not None:
         m = m[idx]

     if b < 1e5:
         quotient = 1000
-        symbol = 'Kb'
+        symbol = "Kb"
     else:
         quotient = 1e6
-        symbol = 'Mb'
+        symbol = "Mb"

     if m == 0:
         xticks = [(k / w) for k in [0, b - 0.5 * w, b + a - w]]
-        xtickslabel = ['{0:.1f}'.format(-(float(b) / quotient)),
-                       referencePointLabel,
-                       '{0:.1f}{1}'.format(float(a) / quotient, symbol)]
+        xtickslabel = [
+            "{0:.1f}".format(-(float(b) / quotient)),
+            referencePointLabel,
+            "{0:.1f}{1}".format(float(a) / quotient, symbol),
+        ]
     else:
         xticks_values = [0]
         xtickslabel = []
@@ -176,7 +205,7 @@ def getProfileTicks(hm, referencePointLabel, startLabel, endLabel, idx):
         # only if upstream region is set, add a x tick
         if b > 0:
             xticks_values.append(b)
-            xtickslabel.append('{0:.1f}'.format(-(float(b) / quotient)))
+            xtickslabel.append("{0:.1f}".format(-(float(b) / quotient)))

         xtickslabel.append(startLabel)
@@ -196,7 +225,7 @@ def getProfileTicks(hm, referencePointLabel, startLabel, endLabel, idx):
         if a > 0:
             xticks_values.append(b + c + m + d + a - w)
-            xtickslabel.append('{0:.1f}{1}'.format(float(a) / quotient, symbol))
+            xtickslabel.append("{0:.1f}{1}".format(float(a) / quotient, symbol))

     xticks = [(k / w) for k in xticks_values]
     xticks = [max(x, 0) for x in xticks]
diff --git a/deeptools/parserCommon.py b/deeptools/parserCommon.py
index 9849d9c431..c64531fe43 100755
--- a/deeptools/parserCommon.py
+++ b/deeptools/parserCommon.py
@@ -851,6 +851,14 @@ def heatmapperOptionalArgs(mode=['heatmap', 'profile'][0]):
                           'group.',
                           action='store_true')

+    optional.add_argument('--repgrplist',
+                          default=None,
+                          nargs='+',
+                          help='Group profiles into replicate groups (e.g. by genotype): '
+                          'assign each profile to a group and plot the group average '
+                          '+/- se/std. If fewer group labels than samples are given, '
+                          'the samples are divided equally among the groups.')
+
     optional.add_argument('--plotFileFormat',
                           metavar='',
                           help='Image format type. If given, this '
diff --git a/deeptools/plotHeatmap.py b/deeptools/plotHeatmap.py
index a2149f8299..761a9750de 100755
--- a/deeptools/plotHeatmap.py
+++ b/deeptools/plotHeatmap.py
@@ -30,6 +30,8 @@
 old_settings = np.seterr(all='ignore')
 plt.ioff()

+import re
+

 def parse_arguments(args=None):
     parser = argparse.ArgumentParser(
@@ -108,7 +110,7 @@ def prepare_layout(hm_matrix, heatmapsize, showSummaryPlot, showColorbar, perGro
     # make height of summary plot
     # proportional to the width of heatmap
     sumplot_height = heatmapwidth
-    spacer_height = heatmapwidth / 8
+    spacer_height = heatmapwidth / 24  # beisi
     # scale height_ratios to convert from row
     # numbers to heatmapheigt fractions
     height_ratio = np.concatenate([[sumplot_height, spacer_height], height_ratio])
@@ -117,6 +119,21 @@
     return grids


+def autobreaklinetitle(title, sep="[-_,.:]", lmax=15):  # beisi
+    # Wrap a long title: split on the separators in `sep` and rejoin with
+    # '-', emitting lines of roughly `lmax` characters.
+    outsep = "-"
+    sss = [rr for rr in re.split(sep, title) if len(rr)]
+    newtitle, tmp = "", ""
+    for ss in sss:
+        tmp0 = tmp
+        tmp += ss
+        if len(tmp) > lmax:
+            newtitle += tmp0.strip(outsep) + "\n"
+            # keep the trailing separator so the next part stays delimited
+            tmp = ss + outsep
+        else:
+            tmp += outsep
+    newtitle += tmp.strip(outsep)
+    newtitle = "\n" + newtitle
+    return newtitle
+
+
 def addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType, plot_type, yAxisLabel, color_list, yMin, yMax, wspace, hspace, colorbar_position, label_rotation=0.0):
     """
@@ -146,7 +163,7 @@ def addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType
         else:
             ax_profile = fig.add_subplot(grids[0, sample_id])

-        ax_profile.set_title(title)
+        ax_profile.set_title(autobreaklinetitle(title))  # beisi
         for group in range(iterNum2):
             if perGroup:
                 sub_matrix = hm.matrix.get_matrix(sample_id, group)
@@ -163,7 +180,8 @@ def addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType
         if sample_id > 0 and len(yMin) == 1 and len(yMax) == 1:
             plt.setp(ax_profile.get_yticklabels(), visible=False)

-        if sample_id == 0 and yAxisLabel != '':
+        ax_profile.get_yaxis().set_tick_params(direction="in", pad=-22)  # beisi
+        if sample_id == 0 and yAxisLabel != "":
             ax_profile.set_ylabel(yAxisLabel)
         xticks, xtickslabel = hm.getTicks(tickIdx)
         if np.ceil(max(xticks)) != float(sub_matrix['matrix'].shape[1] - 1):
@@ -518,7 +536,7 @@ def plotMatrix(hm, outFileName,
     else:
         total_figwidth += 1 / 2.54

-    fig = plt.figure(figsize=(total_figwidth, figheight), constrained_layout=True)
+    fig = plt.figure(figsize=(total_figwidth, figheight), constrained_layout=True)  # beisi
     fig.suptitle(plotTitle, y=1 - (0.06 / figheight))

     grids = prepare_layout(
@@ -589,10 +607,10 @@ def plotMatrix(hm, outFileName,
             iterNum = hm.matrix.get_num_samples()
             iterNum2 = numgroups
         ax_list = addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType, plot_type, yAxisLabel, color_list, yMin, yMax, None, None, colorbar_position, label_rotation)
-
-        if legend_location != 'none':
-            ax_list[-1].legend(loc=legend_location.replace('-', ' '), ncol=1, prop=fontP,
-                               frameon=False, markerscale=0.5)
+
+        if legend_location != "none":  # beisi start
+            legend = ax_list[-1].legend(loc='lower right', shadow=False, bbox_to_anchor=(0.75, 1.35, 0.5, .50), ncol=10, frameon=False, prop=fontP)
+            legend.set_in_layout(False)  # beisi end

     first_group = 0  # helper variable to place the title per sample/group
     for sample in range(hm.matrix.get_num_samples()):
@@ -628,7 +646,7 @@ def plotMatrix(hm, outFileName,
             if group == first_group and not showSummaryPlot and not perGroup:
                 title = hm.matrix.sample_labels[sample]
-                ax.set_title(title)
+                ax.set_title(autobreaklinetitle(title))  # beisi

             if box_around_heatmaps is False:
                 # Turn off the boxes around the individual heatmaps
@@ -760,7 +778,7 @@ def plotMatrix(hm, outFileName,
         fig.colorbar(img, cax=ax)

     if box_around_heatmaps:
-        fig.get_layout_engine().set(wspace=0.10, hspace=0.025, rect=(0.04, 0, 0.96, 0.85))
+        fig.get_layout_engine().set(w_pad=0.04, h_pad=0.01, wspace=0, hspace=0, rect=(0.04, 0, 0.96, 0.95))
     else:
         # When no box is plotted the space between heatmaps is reduced
         fig.get_layout_engine().set(wspace=0.05, hspace=0.01, rect=(0.04, 0, 0.96, 0.85))
diff --git a/deeptools/plotProfile.py b/deeptools/plotProfile.py
index 7497875f20..e3672f75d3 100755
--- a/deeptools/plotProfile.py
+++ b/deeptools/plotProfile.py
@@ -3,14 +3,16 @@
 import sys

+import re
 import argparse
 import numpy as np
 from math import ceil

 import matplotlib
-matplotlib.use('Agg')
-matplotlib.rcParams['pdf.fonttype'] = 42
-matplotlib.rcParams['svg.fonttype'] = 'none'
+
+matplotlib.use("Agg")
+matplotlib.rcParams["pdf.fonttype"] = 42
+matplotlib.rcParams["svg.fonttype"] = "none"
 import deeptools.cm  # noqa: F401
 import matplotlib.pyplot as plt
 from matplotlib.font_manager import FontProperties
@@ -26,32 +28,52 @@
 from deeptools.heatmapper_utilities import plot_single, plotly_single, getProfileTicks
 from deeptools.computeMatrixOperations import filterHeatmapValues

-
 debug = 0
-old_settings = np.seterr(all='ignore')
+old_settings = np.seterr(all="ignore")
 plt.ioff()


 def parse_arguments(args=None):
     parser = argparse.ArgumentParser(
-        parents=[parserCommon.heatmapperMatrixArgs(),
-                 parserCommon.heatmapperOutputArgs(mode='profile'),
-                 parserCommon.heatmapperOptionalArgs(mode='profile')],
+        parents=[
+            parserCommon.heatmapperMatrixArgs(),
+            parserCommon.heatmapperOutputArgs(mode="profile"),
+            parserCommon.heatmapperOptionalArgs(mode="profile"),
+        ],
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description='This tool creates a profile plot for '
-                    'scores over sets of genomic regions. '
-                    'Typically, these regions are genes, but '
-                    'any other regions defined in BED '
-                    ' will work. A matrix generated '
-                    'by computeMatrix is required.',
-        epilog='An example usage is: plotProfile -m matrix.gz',
+        description="This tool creates a profile plot for "
+        "scores over sets of genomic regions. "
+        "Typically, these regions are genes, but "
+        "any other regions defined in BED "
+        "will work. A matrix generated "
A matrix generated " + "by computeMatrix is required.", + epilog="An example usage is: plotProfile -m matrix.gz", add_help=False, - usage='plotProfile -m matrix.gz\n' - 'help: plotProfile -h / plotProfile --help') + usage="plotProfile -m matrix.gz\n" "help: plotProfile -h / plotProfile --help", + ) return parser +def autobreaklinetitle(title, sep="[-_,.:]", lmax=15): + outsep = "-" + sss = [rr for rr in re.split(sep, title) if len(rr)] + newtitle, tmp = "", "" + for ss in sss: + tmp0 = tmp + tmp += ss + if len(tmp) > lmax: + newtitle += tmp0.strip(outsep) + "\n" + tmp = ss + else: + tmp += outsep + newtitle += tmp.strip(outsep) + newtitle = "\n" + newtitle + return newtitle + + def process_args(args=None): args = parse_arguments().parse_args(args) @@ -66,9 +88,9 @@ def process_args(args=None): args.yMax = [None] # Sometimes Galaxy sends --yMax '' and --yMin '' - if args.yMin == ['']: + if args.yMin == [""]: args.yMin = [None] - if args.yMax == ['']: + if args.yMax == [""]: args.yMax = [None] # Convert to floats @@ -89,22 +111,30 @@ def process_args(args=None): class Profile(object): - def __init__(self, hm, out_file_name, - plot_title='', y_axis_label='', - y_min=None, y_max=None, - averagetype='median', - reference_point_label=None, - start_label='TSS', end_label='TES', - plot_height=7, - plot_width=11, - per_group=False, - plot_type='lines', - image_format=None, - color_list=None, - legend_location='best', - plots_per_row=8, - label_rotation=0, - dpi=200): + def __init__( + self, + hm, + out_file_name, + plot_title="", + y_axis_label="", + y_min=None, + y_max=None, + averagetype="median", + reference_point_label=None, + start_label="TSS", + end_label="TES", + plot_height=7, + plot_width=11, + per_group=False, + repgrplist=None, + plot_type="lines", + image_format=None, + color_list=None, + legend_location="best", + plots_per_row=8, + label_rotation=0, + dpi=200, + ): """ Using the hm matrix, makes a line plot either per group or per sample @@ -124,6 +154,7 @@ def __init__(self, hm, out_file_name, plot_height: in cm plot_width: in cm per_group: bool + repgrplist: list plot_type: string image_format: string color_list: list @@ -147,6 +178,7 @@ def __init__(self, hm, out_file_name, self.plot_height = plot_height self.plot_width = plot_width self.per_group = per_group + self.repgrplist = repgrplist self.plot_type = plot_type self.image_format = image_format self.color_list = color_list @@ -157,12 +189,14 @@ def __init__(self, hm, out_file_name, # Honor reference point labels from computeMatrix if reference_point_label is None: - self.reference_point_label = hm.parameters['ref point'] - + self.reference_point_label = hm.parameters["ref point"] # decide how many plots are needed if self.per_group: self.numplots = self.hm.matrix.get_num_groups() - self.numlines = self.hm.matrix.get_num_samples() + if self.repgrplist: + self.numlines = len(set(self.repgrplist)) + else: + self.numlines = self.hm.matrix.get_num_samples() else: self.numplots = self.hm.matrix.get_num_samples() self.numlines = self.hm.matrix.get_num_groups() @@ -175,13 +209,15 @@ def __init__(self, hm, out_file_name, cols = self.numplots self.grids = gridspec.GridSpec(rows, cols) - plt.rcParams['font.size'] = 10.0 + plt.rcParams["font.size"] = 10.0 self.font_p = FontProperties() - self.font_p.set_size('small') + self.font_p.set_size("small") # convert cm values to inches plot_height_inches = rows * self.cm2inch(self.plot_height)[0] - self.fig = plt.figure(figsize=self.cm2inch(cols * self.plot_width, rows * self.plot_height)) + 
+        self.fig = plt.figure(
+            figsize=self.cm2inch(cols * self.plot_width, rows * self.plot_height)
+        )
         self.fig.suptitle(self.plot_title, y=(1 - (0.06 / plot_height_inches)))

         # Ensure that the labels are vectors
@@ -197,7 +233,13 @@ def getTicks(self, idx):
         """
         This is essentially a wrapper around getProfileTicks to accommodate the fact that each column has its own ticks.
         """
-        xticks, xtickslabel = getProfileTicks(self.hm, self.reference_point_label[idx], self.start_label[idx], self.end_label[idx], idx)
+        xticks, xtickslabel = getProfileTicks(
+            self.hm,
+            self.reference_point_label[idx],
+            self.start_label[idx],
+            self.end_label[idx],
+            idx,
+        )
         return xticks, xtickslabel

     @staticmethod
@@ -210,8 +252,9 @@ def cm2inch(*tupl):

     def plot_hexbin(self):
         from matplotlib import cm
+
         cmap = cm.coolwarm
-        cmap.set_bad('black')
+        cmap.set_bad("black")

         if self.image_format == "plotly":
             return self.plotly_hexbin()
@@ -224,17 +267,20 @@ def plot_hexbin(self):

             # split the ax to make room for the colorbar and for each of the
             # groups
-            sub_grid = gridspec.GridSpecFromSubplotSpec(self.numlines, 2, subplot_spec=self.grids[row, col],
-                                                        width_ratios=[0.92, 0.08], wspace=0.05, hspace=0.1)
+            sub_grid = gridspec.GridSpecFromSubplotSpec(
+                self.numlines,
+                2,
+                subplot_spec=self.grids[row, col],
+                width_ratios=[0.92, 0.08],
+                wspace=0.05,
+                hspace=0.1,
+            )

             ax = self.fig.add_subplot(sub_grid[0, 0])

             ax.tick_params(
-                axis='y',
-                which='both',
-                left=False,
-                right=False,
-                labelleft=True)
+                axis="y", which="both", left=False, right=False, labelleft=True
+            )

             if self.per_group:
                 title = self.hm.matrix.group_labels[plot]
@@ -251,7 +297,7 @@ def plot_hexbin(self):
                     _row, _col = data_idx, plot

                 sub_matrix = self.hm.matrix.get_matrix(_row, _col)
-                ma = sub_matrix['matrix']
+                ma = sub_matrix["matrix"]
                 x_values = np.tile(np.arange(ma.shape[1]), (ma.shape[0], 1))
                 img = ax.hexbin(x_values.flatten(), ma.flatten(), cmap=cmap, mincnt=1)
                 _vmin, _vmax = img.get_clim()
@@ -271,7 +317,7 @@ def plot_hexbin(self):
             for data_idx in range(self.numlines)[::-1]:
                 ax = self.fig.add_subplot(sub_grid[data_idx, 0])
                 if data_idx == 0:
-                    ax.set_title(title)
+                    ax.set_title(autobreaklinetitle(title))
                 if data_idx != self.numlines - 1:
                     plt.setp(ax.get_xticklabels(), visible=False)
@@ -283,19 +329,26 @@ def plot_hexbin(self):
                 sub_matrix = self.hm.matrix.get_matrix(_row, _col)

                 if self.per_group:
-                    label = sub_matrix['sample']
+                    label = sub_matrix["sample"]
                 else:
-                    label = sub_matrix['group']
+                    label = sub_matrix["group"]

-                ma = sub_matrix['matrix']
+                ma = sub_matrix["matrix"]
                 try:
                     # matplotlib 2.0
-                    ax.set_facecolor('black')
+                    ax.set_facecolor("black")
                 except:
                     # matplotlib <2.0
-                    ax.set_axis_bgcolor('black')
+                    ax.set_axis_bgcolor("black")
                 x_values = np.tile(np.arange(ma.shape[1]), (ma.shape[0], 1))
-                img = ax.hexbin(x_values.flatten(), ma.flatten(), cmap=cmap, mincnt=1, vmin=vmin, vmax=vmax)
+                img = ax.hexbin(
+                    x_values.flatten(),
+                    ma.flatten(),
+                    cmap=cmap,
+                    mincnt=1,
+                    vmin=vmin,
+                    vmax=vmax,
+                )

                 if plot == 0:
                     ax.axes.set_ylabel(label)
@@ -313,7 +366,7 @@ def plot_hexbin(self):
             xticks, xtickslabel = self.getTicks(plot)
             if np.ceil(max(xticks)) != float(ma.shape[1] - 1):
-                tickscale = float(sub_matrix['matrix'].shape[1]) / max(xticks)
+                tickscale = float(sub_matrix["matrix"].shape[1]) / max(xticks)
                 xticks_use = [x * tickscale for x in xticks]
                 ax_list[0].axes.set_xticks(xticks_use)
             else:
@@ -323,8 +376,8 @@ def plot_hexbin(self):
             # such that they don't fall off
             # the heatmap sides
             ticks = ax_list[-1].xaxis.get_major_ticks()
-            ticks[0].label1.set_horizontalalignment('left')
-            ticks[-1].label1.set_horizontalalignment('right')
+            ticks[0].label1.set_horizontalalignment("left")
+            ticks[-1].label1.set_horizontalalignment("right")

             cax = self.fig.add_subplot(sub_grid[:, 1])
             self.fig.colorbar(img, cax=cax)
@@ -337,11 +390,13 @@ def plot_hexbin(self):

     def plotly_hexbin(self):
         """plot_hexbin, but for plotly. it's annoying that we have to have sub-subplots"""
         fig = go.Figure()
-        cols = self.plots_per_row if self.numplots > self.plots_per_row else self.numplots
+        cols = (
+            self.plots_per_row if self.numplots > self.plots_per_row else self.numplots
+        )
         rows = np.ceil(self.numplots / float(cols)).astype(int)
-        fig['layout'].update(title=self.plot_title)
-        domainWidth = .9 / cols
-        domainHeight = .9 / rows
+        fig["layout"].update(title=self.plot_title)
+        domainWidth = 0.9 / cols
+        domainHeight = 0.9 / rows
         bufferHeight = 0.0
         if rows > 1:
             bufferHeight = 0.1 / (rows - 1)
@@ -377,9 +432,21 @@ def plotly_hexbin(self):
             base = col * (domainWidth + bufferWidth)
             domain = [base, base + domainWidth]
             titleX = base + 0.5 * domainWidth
-            xanchor = 'x{}'.format(i + 1)
-            fig['layout']['xaxis{}'.format(i + 1)] = dict(domain=domain)
-            annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': titleY, 'x': titleX, 'font': {'size': 16}, 'showarrow': False})
+            xanchor = "x{}".format(i + 1)
+            fig["layout"]["xaxis{}".format(i + 1)] = dict(domain=domain)
+            annos.append(
+                {
+                    "yanchor": "bottom",
+                    "xref": "paper",
+                    "xanchor": "center",
+                    "yref": "paper",
+                    "text": title,
+                    "y": titleY,
+                    "x": titleX,
+                    "font": {"size": 16},
+                    "showarrow": False,
+                }
+            )

             # set yMin/yMax
             yMin = np.inf
             yMax = -np.inf
@@ -391,7 +458,7 @@ def plotly_hexbin(self):
             for j in range(self.numlines):
                 if self.per_group:
                     _row, _col = i, j
                 else:
                     _row, _col = j, i

-                ma = self.hm.matrix.get_matrix(_row, _col)['matrix']
+                ma = self.hm.matrix.get_matrix(_row, _col)["matrix"]
                 if np.min(ma) < yMin:
                     yMin = np.min(ma)
                 if np.max(ma) > yMax:
                     yMax = np.max(ma)
@@ -407,30 +474,53 @@ def plotly_hexbin(self):
                 else:
                     _row, _col = j, i
                 foo = i * self.numlines + j + 1
-                yanchor = 'y{}'.format(foo)
+                yanchor = "y{}".format(foo)
                 base = row * (domainHeight + bufferHeight) + j * subHeight
                 domain = [base, base + subHeight]
-                fig['layout']['yaxis{}'.format(foo)] = {'domain': domain, 'title': self.y_axis_label, 'anchor': xanchor, 'range': [yMin, yMax]}
+                fig["layout"]["yaxis{}".format(foo)] = {
+                    "domain": domain,
+                    "title": self.y_axis_label,
+                    "anchor": xanchor,
+                    "range": [yMin, yMax],
+                }
                 if j == 0:
                     _ = "xaxis{}".format(xanchor[1:])
-                    fig['layout'][_].update(anchor='y{}'.format(foo))
+                    fig["layout"][_].update(anchor="y{}".format(foo))
                 if col == 0:
                     titleY = base + 0.5 * subHeight
-                    annos.append({'yanchor': 'middle', 'xref': 'paper', 'xanchor': 'left', 'yref': 'paper', 'text': sideLabels[j], 'y': titleY, 'x': -0.03, 'font': {'size': 16}, 'showarrow': False, 'textangle': -90})
+                    annos.append(
+                        {
+                            "yanchor": "middle",
+                            "xref": "paper",
+                            "xanchor": "left",
+                            "yref": "paper",
+                            "text": sideLabels[j],
+                            "y": titleY,
+                            "x": -0.03,
+                            "font": {"size": 16},
+                            "showarrow": False,
+                            "textangle": -90,
+                        }
+                    )

                 sub_matrix = self.hm.matrix.get_matrix(_row, _col)
-                ma = self.hm.matrix.get_matrix(_row, _col)['matrix']
+                ma = self.hm.matrix.get_matrix(_row, _col)["matrix"]

-                fig['layout']['xaxis{}'.format(i + 1)].update(range=[0, ma.shape[1]])
+                fig["layout"]["xaxis{}".format(i + 1)].update(range=[0, ma.shape[1]])

                 if self.per_group:
-                    label = sub_matrix['sample']
+                    label = sub_matrix["sample"]
                 else:
-                    label = sub_matrix['group']
+                    label = sub_matrix["group"]

                 # Manually compute the 2D histogram with 100x100 bins
                 x_values = np.tile(np.arange(ma.shape[1]), (ma.shape[0], 1))
-                z, xe, ye = np.histogram2d(x_values.flatten(), ma.flatten(), bins=100, range=[[0, ma.shape[1]], [yMin, yMax]])
+                z, xe, ye = np.histogram2d(
+                    x_values.flatten(),
+                    ma.flatten(),
+                    bins=100,
+                    range=[[0, ma.shape[1]], [yMin, yMax]],
+                )

                 _vmin = np.min(z)
                 _vmax = np.max(z)
@@ -439,7 +529,15 @@ def plotly_hexbin(self):
                 if _vmax > vmax:
                     vmax = _vmax

-                trace = go.Contour(z=z.T, x=xe, y=ye, xaxis=xanchor, yaxis=yanchor, name=label, connectgaps=False)
+                trace = go.Contour(
+                    z=z.T,
+                    x=xe,
+                    y=ye,
+                    xaxis=xanchor,
+                    yaxis=yanchor,
+                    name=label,
+                    connectgaps=False,
+                )
                 data.append(trace)

         # Assume the bounds for the last graph are correct
@@ -451,18 +549,25 @@ def plotly_hexbin(self):
         else:
             xticks_use = xticks
         xticks_use = [np.ceil(x) for x in xticks_use]
-        fig['layout']['xaxis{}'.format(i + 1)].update(tickmode='array', tickvals=xticks_use, ticktext=xtickslabel, tickangle=self.label_rotation)
+        fig["layout"]["xaxis{}".format(i + 1)].update(
+            tickmode="array",
+            tickvals=xticks_use,
+            ticktext=xtickslabel,
+            tickangle=self.label_rotation,
+        )

         for trace in data:
             trace.update(zmin=vmin, zmax=vmax)

         fig.add_traces(data)
-        fig['layout']['annotations'] = annos
+        fig["layout"]["annotations"] = annos
         py.plot(fig, filename=self.out_file_name, auto_open=False)

     def plot_heatmap(self):
-        cmap = ['RdYlBu_r']
-        if self.color_list is not None:  # check the length to be equal to the numebr of plots otherwise multiply it!
+        cmap = ["RdYlBu_r"]
+        if (
+            self.color_list is not None
+        ):  # check the length to be equal to the number of plots, otherwise multiply it!
             cmap = self.color_list
         if len(cmap) < self.numplots:
             all_colors = cmap
@@ -497,26 +602,28 @@ def plot_heatmap(self):
                 localYMax = None

             # split the ax to make room for the colorbar
-            sub_grid = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=self.grids[row, col],
-                                                        width_ratios=[0.92, 0.08], wspace=0.05)
+            sub_grid = gridspec.GridSpecFromSubplotSpec(
+                1,
+                2,
+                subplot_spec=self.grids[row, col],
+                width_ratios=[0.92, 0.08],
+                wspace=0.05,
+            )
             ax = self.fig.add_subplot(sub_grid[0])
             cax = self.fig.add_subplot(sub_grid[1])

             ax.tick_params(
-                axis='y',
-                which='both',
-                left=False,
-                right=False,
-                labelleft=True)
+                axis="y", which="both", left=False, right=False, labelleft=True
+            )

             if self.per_group:
                 title = self.hm.matrix.group_labels[plot]
-                tickIdx = plot % self.hm.matrix.get_num_samples()
+                tickIdx = plot % self.numlines
             else:
                 title = self.hm.matrix.sample_labels[plot]
                 tickIdx = plot
-            ax.set_title(title)
+            ax.set_title(autobreaklinetitle(title))
             mat = []  # when drawing a heatmap (in contrast to drawing lines)
             for data_idx in range(self.numlines):
                 if self.per_group:
@@ -531,13 +638,23 @@ def plot_heatmap(self):
                 sub_matrix = self.hm.matrix.get_matrix(row, col)

                 if self.per_group:
-                    label = sub_matrix['sample']
+                    label = sub_matrix["sample"]
                 else:
-                    label = sub_matrix['group']
+                    label = sub_matrix["group"]
                 labels.append(label)
-                mat.append(np.ma.__getattribute__(self.averagetype)(sub_matrix['matrix'], axis=0))
-            img = ax.imshow(np.vstack(mat), interpolation='nearest',
-                            cmap=cmap[plot], aspect='auto', vmin=localYMin, vmax=localYMax)
+                mat.append(
+                    np.ma.__getattribute__(self.averagetype)(
+                        sub_matrix["matrix"], axis=0
+                    )
+                )
+            img = ax.imshow(
+                np.vstack(mat),
+                interpolation="nearest",
+                cmap=cmap[plot],
+                aspect="auto",
+                vmin=localYMin,
+                vmax=localYMax,
+            )
             self.fig.colorbar(img, cax=cax)

             totalWidth = np.vstack(mat).shape[1]
@@ -553,12 +670,14 @@ def plot_heatmap(self):
             # such that they don't fall off
             # the heatmap sides
             ticks = ax.xaxis.get_major_ticks()
-            ticks[0].label1.set_horizontalalignment('left')
-            ticks[-1].label1.set_horizontalalignment('right')
+            ticks[0].label1.set_horizontalalignment("left")
+            ticks[-1].label1.set_horizontalalignment("right")

             # add labels as y ticks labels
             ymin, ymax = ax.axes.get_ylim()
-            pos, distance = np.linspace(ymin, ymax, len(labels), retstep=True, endpoint=False)
+            pos, distance = np.linspace(
+                ymin, ymax, len(labels), retstep=True, endpoint=False
+            )
             d_half = float(distance) / 2
             yticks = [x + d_half for x in pos]
@@ -582,11 +701,13 @@ def plot_heatmap(self):

     def plotly_heatmap(self):
         """plot_heatmap, but with plotly output"""
         fig = go.Figure()
-        cols = self.plots_per_row if self.numplots > self.plots_per_row else self.numplots
+        cols = (
+            self.plots_per_row if self.numplots > self.plots_per_row else self.numplots
+        )
         rows = np.ceil(self.numplots / float(cols)).astype(int)
-        fig['layout'].update(title=self.plot_title)
-        domainWidth = .9 / cols
-        domainHeight = .9 / rows
+        fig["layout"].update(title=self.plot_title)
+        domainWidth = 0.9 / cols
+        domainHeight = 0.9 / rows
         bufferHeight = 0.0
         if rows > 1:
             bufferHeight = 0.1 / (rows - 1)
@@ -614,17 +735,36 @@ def plotly_heatmap(self):
             base = row * (domainHeight + bufferHeight)
             domain = [base, base + domainHeight]
             titleY = base + domainHeight
-            xanchor = 'x{}'.format(i + 1)
-            yanchor = 'y{}'.format(i + 1)
+            xanchor = "x{}".format(i + 1)
+            yanchor = "y{}".format(i + 1)

             visible = False
             if col == 0:
                 visible = True

-            fig['layout']['yaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': xanchor, 'visible': visible}
+            fig["layout"]["yaxis{}".format(i + 1)] = {
+                "domain": domain,
+                "anchor": xanchor,
+                "visible": visible,
+            }

             base = col * (domainWidth + bufferWidth)
             domain = [base, base + domainWidth]
             titleX = base + 0.5 * domainWidth
-            fig['layout']['xaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': yanchor}
-            annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': titleY, 'x': titleX, 'font': {'size': 16}, 'showarrow': False})
+            fig["layout"]["xaxis{}".format(i + 1)] = {
+                "domain": domain,
+                "anchor": yanchor,
+            }
+            annos.append(
+                {
+                    "yanchor": "bottom",
+                    "xref": "paper",
+                    "xanchor": "center",
+                    "yref": "paper",
+                    "text": title,
+                    "y": titleY,
+                    "x": titleX,
+                    "font": {"size": 16},
+                    "showarrow": False,
+                }
+            )

             mat = []
             labels = []
@@ -637,17 +777,28 @@ def plotly_heatmap(self):
                 sub_matrix = self.hm.matrix.get_matrix(row, col)

                 if self.per_group:
-                    label = sub_matrix['sample']
+                    label = sub_matrix["sample"]
                 else:
-                    label = sub_matrix['group']
+                    label = sub_matrix["group"]
                 labels.append(label)
-                mat.append(np.ma.__getattribute__(self.averagetype)(sub_matrix['matrix'], axis=0))
+                mat.append(
+                    np.ma.__getattribute__(self.averagetype)(
+                        sub_matrix["matrix"], axis=0
+                    )
+                )
                 if np.min(mat[-1]) < zmin:
                     zmin = np.min(mat[-1])
                 if np.max(mat[-1]) > zmax:
                     zmax = np.max(mat[-1])

             totalWidth = len(mat[-1])
-            trace = go.Heatmap(name=title, z=mat, x=range(totalWidth + 1), y=labels, xaxis=xanchor, yaxis=yanchor)
+            trace = go.Heatmap(
+                name=title,
+                z=mat,
+                x=range(totalWidth + 1),
+                y=labels,
+                xaxis=xanchor,
+                yaxis=yanchor,
+            )
             data.append(trace)

             # Add ticks
@@ -658,7 +809,12 @@ def plotly_heatmap(self):
             else:
                 xticks_use = xticks
             xticks_use = [np.ceil(x) for x in xticks_use]
-            fig['layout']['xaxis{}'.format(i + 1)].update(tickmode='array', tickvals=xticks_use, ticktext=xtickslabel, tickangle=self.label_rotation)
+            fig["layout"]["xaxis{}".format(i + 1)].update(
+                tickmode="array",
+                tickvals=xticks_use,
+                ticktext=xtickslabel,
+                tickangle=self.label_rotation,
+            )

         # Adjust color scale limits
         for i, trace in enumerate(data):
@@ -671,7 +827,7 @@ def plotly_heatmap(self):
             trace.update(zmin=zminUse, zmax=zmaxUse)

         fig.add_traces(data)
-        fig['layout']['annotations'] = annos
+        fig["layout"]["annotations"] = annos
         py.plot(fig, filename=self.out_file_name, auto_open=False)

     def plot_profile(self):
@@ -681,21 +837,30 @@ def plot_profile(self):
             self.y_max = [None]

         if not self.color_list:
-            cmap_plot = plt.get_cmap('jet')
+            cmap_plot = plt.get_cmap("jet")
             if self.numlines > 1:  # kmeans, so we need to color by cluster
-                self.color_list = cmap_plot(np.arange(self.numlines, dtype=float) / float(self.numlines))
+                self.color_list = cmap_plot(
+                    np.arange(self.numlines, dtype=float) / float(self.numlines)
+                )
             else:
-                self.color_list = cmap_plot(np.arange(self.numplots, dtype=float) / float(self.numplots))
-        if (self.numlines > 1 and len(self.color_list) < self.numlines) or\
-           (self.numlines == 1 and len(self.color_list) < self.numplots):
-            sys.exit("\nThe given list of colors is too small, "
-                     "at least {} colors are needed\n".format(self.numlines))
+                self.color_list = cmap_plot(
+                    np.arange(self.numplots, dtype=float) / float(self.numplots)
+                )
+        if (self.numlines > 1 and len(self.color_list) < self.numlines) or (
+            self.numlines == 1 and len(self.color_list) < self.numplots
+        ):
+            sys.exit(
+                "\nThe given list of colors is too small, "
+                "at least {} colors are needed\n".format(self.numlines)
+            )
         for color in self.color_list:
             if not pltcolors.is_color_like(color):
-                sys.exit("\nThe color name {} is not valid. Check "
-                         "the name or try with a html hex string "
-                         "for example #eeff22".format(color))
+                sys.exit(
+                    "\nThe color name {} is not valid. Check "
Check " + "the name or try with a html hex string " + "for example #eeff22".format(color) + ) if self.image_format == "plotly": return self.plotly_profile() @@ -718,48 +883,105 @@ def plot_profile(self): title = self.hm.matrix.group_labels[plot] if row != 0 and len(self.y_min) == 1 and len(self.y_max) == 1: plt.setp(ax.get_yticklabels(), visible=False) - tickIdx = plot % self.hm.matrix.get_num_samples() + tickIdx = plot % self.numlines else: title = self.hm.matrix.sample_labels[plot] if col != 0 and len(self.y_min) == 1 and len(self.y_max) == 1: plt.setp(ax.get_yticklabels(), visible=False) tickIdx = plot - ax.set_title(title) - for data_idx in range(self.numlines): - if self.per_group: - _row, _col = plot, data_idx - else: - _row, _col = data_idx, plot - if localYMin is None or self.y_min[col % len(self.y_min)] < localYMin: - localYMin = self.y_min[col % len(self.y_min)] - if localYMax is None or self.y_max[col % len(self.y_max)] > localYMax: - localYMax = self.y_max[col % len(self.y_max)] - - sub_matrix = self.hm.matrix.get_matrix(_row, _col) - - if self.per_group: - label = sub_matrix['sample'] - else: - label = sub_matrix['group'] - - if self.numlines > 1: - coloridx = data_idx - else: - coloridx = plot - plot_single(ax, sub_matrix['matrix'], - self.averagetype, - self.color_list[coloridx], - label, - plot_type=self.plot_type) - globalYmin = min(float(globalYmin), ax.get_ylim()[0]) + ax.set_title(autobreaklinetitle(title)) + if localYMin is None or self.y_min[col % len(self.y_min)] < localYMin: + localYMin = self.y_min[col % len(self.y_min)] + if localYMax is None or self.y_max[col % len(self.y_max)] > localYMax: + localYMax = self.y_max[col % len(self.y_max)] + if self.per_group and self.repgrplist: + nsamptmp = self.hm.matrix.get_num_samples() + repgrp_samp_dict = {} + repgrplistuniq = [] + for tmp in self.repgrplist: + if tmp not in repgrplistuniq: + repgrplistuniq.append(tmp) + + for data_idx in range(nsamptmp): + if len(self.repgrplist) >= nsamptmp: + thisrepgrp = self.repgrplist[data_idx] + else: + thisrepgrp = repgrplistuniq[ + int(data_idx / (nsamptmp / self.numlines)) + ] + try: + repgrp_samp_dict[thisrepgrp].append(data_idx) + except: + repgrp_samp_dict[thisrepgrp] = [data_idx] + + if debug: + set_trace() + for irepgrp, repgrp in enumerate(repgrplistuniq): + sub_matrix_list = [] + for data_idx in repgrp_samp_dict[repgrp]: + _row, _col = plot, data_idx + sub_matrix = self.hm.matrix.get_matrix(_row, _col) + sub_matrix_list.append(sub_matrix["matrix"]) + + label = f"{repgrp}(n={len(repgrp_samp_dict[repgrp])})" + + if self.numlines > 1: + coloridx = irepgrp + else: + coloridx = plot + plot_single( + ax, + sub_matrix_list, + self.averagetype, + self.color_list[coloridx], + label, + plot_type=self.plot_type, + ) + else: + for data_idx in range(self.numlines): + if self.per_group: + _row, _col = plot, data_idx + else: + _row, _col = data_idx, plot + if ( + localYMin is None or + self.y_min[col % len(self.y_min)] < localYMin + ): + localYMin = self.y_min[col % len(self.y_min)] + if ( + localYMax is None or + self.y_max[col % len(self.y_max)] > localYMax + ): + localYMax = self.y_max[col % len(self.y_max)] + + sub_matrix = self.hm.matrix.get_matrix(_row, _col) + + if self.per_group: + label = sub_matrix["sample"] + else: + label = sub_matrix["group"] + + if self.numlines > 1: + coloridx = data_idx + else: + coloridx = plot + plot_single( + ax, + sub_matrix["matrix"], + self.averagetype, + self.color_list[coloridx], + label, + plot_type=self.plot_type, + ) + globalYmin = 
             globalYmax = max(globalYmax, ax.get_ylim()[1])

             # Exclude ticks from all but one subplot by default
             if col > 0 and len(self.y_min) == 1 and len(self.y_max) == 1:
                 plt.setp(ax.get_yticklabels(), visible=False)

-            totalWidth = sub_matrix['matrix'].shape[1]
+            totalWidth = sub_matrix["matrix"].shape[1]
             xticks, xtickslabel = self.getTicks(tickIdx)
             if np.ceil(max(xticks)) != float(totalWidth - 1):
                 tickscale = float(totalWidth) / max(xticks)
@@ -772,15 +994,19 @@ def plot_profile(self):
             # such that they don't fall off
             # the heatmap sides
             ticks = ax.xaxis.get_major_ticks()
-            ticks[0].label1.set_horizontalalignment('left')
-            ticks[-1].label1.set_horizontalalignment('right')
+            ticks[0].label1.set_horizontalalignment("left")
+            ticks[-1].label1.set_horizontalalignment("right")

-            if first and self.y_axis_label != '':
+            if first and self.y_axis_label != "":
                 ax.set_ylabel(self.y_axis_label)
-            if first and self.plot_type not in ['heatmap', 'overlapped_lines']:
-                ax.legend(loc=self.legend_location.replace('-', ' '),
-                          ncol=1, prop=self.font_p,
-                          frameon=False, markerscale=0.5)
+            if first and self.plot_type not in ["heatmap", "overlapped_lines"]:
+                ax.legend(
+                    loc=self.legend_location.replace("-", " "),
+                    ncol=1,
+                    prop=self.font_p,
+                    frameon=False,
+                    markerscale=0.5,
+                )
             if len(self.y_min) == 1 and len(self.y_max) == 1:
                 first = False
             ax_list.append(ax)
@@ -813,11 +1039,13 @@ def plotly_profile(self):
         y_min, y_max, and color_list are set already
         """
         fig = go.Figure()
-        cols = self.plots_per_row if self.numplots > self.plots_per_row else self.numplots
+        cols = (
+            self.plots_per_row if self.numplots > self.plots_per_row else self.numplots
+        )
         rows = np.ceil(self.numplots / float(cols)).astype(int)
-        fig['layout'].update(title=self.plot_title)
-        domainWidth = .9 / cols
-        domainHeight = .9 / rows
+        fig["layout"].update(title=self.plot_title)
+        domainWidth = 0.9 / cols
+        domainHeight = 0.9 / rows
         bufferHeight = 0.0
         if rows > 1:
             bufferHeight = 0.1 / (rows - 1)
@@ -833,22 +1061,42 @@ def plotly_profile(self):
             row = np.floor(i / self.plots_per_row)
             # row = rows - i / self.plots_per_row - 1
             col = i % self.plots_per_row
-            xanchor = 'x{}'.format(i + 1)
-            yanchor = 'y{}'.format(i + 1)
+            xanchor = "x{}".format(i + 1)
+            yanchor = "y{}".format(i + 1)
             base = row * (domainHeight + bufferHeight)
             domain = [base, base + domainHeight]
             titleY = base + domainHeight
-            fig['layout']['yaxis{}'.format(i + 1)] = {'domain': domain, 'title': self.y_axis_label, 'anchor': xanchor, 'autorange': False}
+            fig["layout"]["yaxis{}".format(i + 1)] = {
+                "domain": domain,
+                "title": self.y_axis_label,
+                "anchor": xanchor,
+                "autorange": False,
+            }
             base = col * (domainWidth + bufferWidth)
             domain = [base, base + domainWidth]
             titleX = base + 0.5 * domainWidth
-            fig['layout']['xaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': yanchor}
+            fig["layout"]["xaxis{}".format(i + 1)] = {
+                "domain": domain,
+                "anchor": yanchor,
+            }
             if self.per_group:
                 title = self.hm.matrix.group_labels[i]
             else:
                 title = self.hm.matrix.sample_labels[i]
-            annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': titleY, 'x': titleX, 'font': {'size': 16}, 'showarrow': False})
+            annos.append(
+                {
+                    "yanchor": "bottom",
+                    "xref": "paper",
+                    "xanchor": "center",
+                    "yref": "paper",
+                    "text": title,
+                    "y": titleY,
+                    "x": titleX,
+                    "font": {"size": 16},
+                    "showarrow": False,
+                }
+            )

             for j in range(self.numlines):
                 if self.per_group:
                     _row, _col = i, j
                 else:
                     _row, _col = j, i

                 sub_matrix = self.hm.matrix.get_matrix(_row, _col)
-                fig['layout']['xaxis{}'.format(i + 1)].update(range=[0, sub_matrix['matrix'].shape[1]])
+                fig["layout"]["xaxis{}".format(i + 1)].update(
+                    range=[0, sub_matrix["matrix"].shape[1]]
+                )

                 if self.per_group:
-                    label = sub_matrix['sample']
+                    label = sub_matrix["sample"]
                 else:
-                    label = sub_matrix['group']
+                    label = sub_matrix["group"]

                 if self.numlines > 1:
                     coloridx = j
                 else:
                     coloridx = i
                 color = self.color_list[coloridx]
-                traces = plotly_single(sub_matrix['matrix'],
-                                       self.averagetype,
-                                       color,
-                                       label,
-                                       plot_type=self.plot_type)
+                traces = plotly_single(
+                    sub_matrix["matrix"],
+                    self.averagetype,
+                    color,
+                    label,
+                    plot_type=self.plot_type,
+                )
                 for trace in traces:
                     trace.update(xaxis=xanchor, yaxis=yanchor)
-                    if yMin is None or min(trace['y']) < yMin:
-                        yMin = min(trace['y'])
-                    if yMax is None or max(trace['y']) > yMax:
-                        yMax = max(trace['y'])
+                    if yMin is None or min(trace["y"]) < yMin:
+                        yMin = min(trace["y"])
+                    if yMax is None or max(trace["y"]) > yMax:
+                        yMax = max(trace["y"])

                 if row == col == 0:
                     traces[0].update(showlegend=True)
                 data.extend(traces)

-                totalWidth = sub_matrix['matrix'].shape[1]
+                totalWidth = sub_matrix["matrix"].shape[1]
                 xticks, xtickslabel = self.getTicks(i)
                 if np.ceil(max(xticks)) != float(totalWidth):
                     tickscale = float(totalWidth) / max(xticks)
@@ -891,20 +1143,25 @@ def plotly_profile(self):
                 else:
                     xticks_use = xticks
                 xticks_use = [np.ceil(x) for x in xticks_use]
-                fig['layout']['xaxis{}'.format(i + 1)].update(tickmode='array', tickvals=xticks_use, ticktext=xtickslabel, tickangle=self.label_rotation)
+                fig["layout"]["xaxis{}".format(i + 1)].update(
+                    tickmode="array",
+                    tickvals=xticks_use,
+                    ticktext=xtickslabel,
+                    tickangle=self.label_rotation,
+                )

         # Set the y limits
         for i in range(self.numplots):
-            yaxis = 'yaxis{}'.format(i + 1)
+            yaxis = "yaxis{}".format(i + 1)
             yRange = [yMin, yMax]
             if self.y_min[i % len(self.y_min)] is not None:
                 yRange[0] = self.y_min[i % len(self.y_min)]
             if self.y_max[i % len(self.y_max)] is not None:
                 yRange[1] = self.y_max[i % len(self.y_max)]
-            fig['layout'][yaxis].update(range=yRange)
+            fig["layout"][yaxis].update(range=yRange)

         fig.add_traces(data)
-        fig['layout']['annotations'] = annos
+        fig["layout"]["annotations"] = annos
         py.plot(fig, filename=self.out_file_name, auto_open=False)

@@ -915,21 +1172,40 @@ def main(args=None):
     args.matrixFile.close()
     hm.read_matrix_file(matrix_file)

-    if hm.parameters['min threshold'] is not None or hm.parameters['max threshold'] is not None:
-        filterHeatmapValues(hm, hm.parameters['min threshold'], hm.parameters['max threshold'])
+    if (
+        hm.parameters["min threshold"] is not None or
+        hm.parameters["max threshold"] is not None
+    ):
+        filterHeatmapValues(
+            hm, hm.parameters["min threshold"], hm.parameters["max threshold"]
+        )

     if args.kmeans is not None:
-        hm.matrix.hmcluster(args.kmeans, method='kmeans', clustering_samples=args.clusterUsingSamples)
+        hm.matrix.hmcluster(
+            args.kmeans, method="kmeans", clustering_samples=args.clusterUsingSamples
+        )
     else:
         if args.hclust is not None:
-            print("Performing hierarchical clustering."
-                  "Please note that it might be very slow for large datasets.\n")
-            hm.matrix.hmcluster(args.hclust, method='hierarchical', clustering_samples=args.clusterUsingSamples)
-
-    group_len_ratio = np.diff(hm.matrix.group_boundaries) / float(len(hm.matrix.regions))
+            print(
+                "Performing hierarchical clustering. "
+ "Please note that it might be very slow for large datasets.\n" + ) + hm.matrix.hmcluster( + args.hclust, + method="hierarchical", + clustering_samples=args.clusterUsingSamples, + ) + + group_len_ratio = np.diff(hm.matrix.group_boundaries) / float( + len(hm.matrix.regions) + ) if np.any(group_len_ratio < 5.0 / 1000): problem = np.flatnonzero(group_len_ratio < 5.0 / 1000) - sys.stderr.write("WARNING: Group '{}' is too small for plotting, you might want to remove it. \n".format(hm.matrix.group_labels[problem[0]])) + sys.stderr.write( + "WARNING: Group '{}' is too small for plotting, you might want to remove it. \n".format( + hm.matrix.group_labels[problem[0]] + ) + ) if args.regionsLabel: hm.matrix.set_group_labels(args.regionsLabel) @@ -938,36 +1214,44 @@ def main(args=None): hm.matrix.set_sample_labels(args.samplesLabel) if args.outFileNameData: - hm.save_tabulated_values(args.outFileNameData, reference_point_label=args.refPointLabel, - start_label=args.startLabel, - end_label=args.endLabel, - averagetype=args.averageType) + hm.save_tabulated_values( + args.outFileNameData, + reference_point_label=args.refPointLabel, + start_label=args.startLabel, + end_label=args.endLabel, + averagetype=args.averageType, + ) if args.outFileSortedRegions: hm.save_BED(args.outFileSortedRegions) - prof = Profile(hm, args.outFileName, - plot_title=args.plotTitle, - y_axis_label=args.yAxisLabel, - y_min=args.yMin, y_max=args.yMax, - averagetype=args.averageType, - reference_point_label=args.refPointLabel, - start_label=args.startLabel, - end_label=args.endLabel, - plot_height=args.plotHeight, - plot_width=args.plotWidth, - per_group=args.perGroup, - plot_type=args.plotType, - image_format=args.plotFileFormat, - color_list=args.colors, - legend_location=args.legendLocation, - plots_per_row=args.numPlotsPerRow, - label_rotation=args.label_rotation, - dpi=args.dpi) - - if args.plotType == 'heatmap': + prof = Profile( + hm, + args.outFileName, + plot_title=args.plotTitle, + y_axis_label=args.yAxisLabel, + y_min=args.yMin, + y_max=args.yMax, + averagetype=args.averageType, + reference_point_label=args.refPointLabel, + start_label=args.startLabel, + end_label=args.endLabel, + plot_height=args.plotHeight, + plot_width=args.plotWidth, + per_group=args.perGroup, + repgrplist=args.repgrplist, + plot_type=args.plotType, + image_format=args.plotFileFormat, + color_list=args.colors, + legend_location=args.legendLocation, + plots_per_row=args.numPlotsPerRow, + label_rotation=args.label_rotation, + dpi=args.dpi, + ) + + if args.plotType == "heatmap": prof.plot_heatmap() - elif args.plotType == 'overlapped_lines': + elif args.plotType == "overlapped_lines": prof.plot_hexbin() else: prof.plot_profile()