# HG changeset patch # User schang # Date 1417567997 18000 # Node ID 348ffb045343d8556457cf76369bb36cf07b18aa # Parent cbdcc962173097b13bec76e2cdb74027b01da31f Uploaded diff -r cbdcc9621730 -r 348ffb045343 frp_tool.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/frp_tool.py Tue Dec 02 19:53:17 2014 -0500 @@ -0,0 +1,300 @@ +#!/usr/bin/env python + +''' +Author: Silvia Chang +Metagenomic Final Project +Fall 2014 + +Script to create fragment recruitment plots, after read-reference alignment using Bowtie (SAM output) and FR-HIT (fr-hit output) programs +Simplified/revised version of the fragrecruit project from https://github.com/mstrosaker/fragrecruit + +Usage: ./frp_tool.py [options] --input --output + +''' +import matplotlib.pyplot as plt +import os, sys, getopt, imp, errno +# import shutil +from refseq_parser import SequenceFile +from SAM_parser import SAMFile +from FRHIT_parser import FRHITFile + +class FragmentPlotter: + '''Contains all functions to create fragment recruitment plots''' + + # Color list. Can only handle 5 samples. Add more colors for more samples. + colors = [ + (0.78, 0.59, 0.17, 1.0), #dark gold + (0.99, 0.60, 0.17, 1.0), #light orange + (0.60, 0.82, 0.47, 1.0), #medium/light green + (0.95, 0.40, 0.40, 1.0), #dark pink/fuschia + (0.17, 0.50, 0.69, 1.0), #medium teal/blue + (0.85, 0.64, 0.60, 1.0), #light plum + ] + + def __init__(self, microbiome, pt_color, ymin=50): + self.microbiome = microbiome + self.decorated = False + self.pt_color = pt_color + self.ymin = ymin + + def new_plot(self, ref_name, contigs, begin, length): + ''' Clears previous data and start a blank plot''' + # refgenome - string, organism name of the refgenome + # contigs - a list containing a [name,length] pair of the refgenome + # begin, length - integers indicate beginning/length of plot + + plt.close() + # Initialize variables that will contain all the fragments information + self.refgenome = ref_name + self.contigs = contigs + self.begin = begin + self.samples = [] + self.n_points = 0 + self.x_points = [] + self.y_points = [] + self.clr_points = [] + self.decorated = False + + self.full_length = 0 + + # Builds the full length of the reference genome, or partial based on begin and length variables + for contig in self.contigs: + self.full_length += contig[1] + + # Sets the range of the x axis + if length != 0: + self.xextent = begin + length + else: + self.xextent = self.full_length + + def decorate(self): + ''' Decorates the plot by adding axes, axes labels, title, tickmarks, etc.''' + if not self.decorated: + self.decorated = True + + # Sets the rc parameters, add x-axis tickmarks and labels them + plt.rc('xtick', labelsize=8) + ticks = [self.begin] # beginning tick + ticks.append(self.xextent/2) # mid tick + ticks.append(self.xextent) # last tick + + labels = [int(self.begin)] # beginning nucleotide + labels.append(str(self.xextent/2)) # mid nucleotide + labels.append(str(self.xextent)) # last nucleotide + + plt.xticks(ticks, labels) + + # Sets x and y axes boundaries and labels + plt.xlim(self.begin, self.xextent) + plt.ylim(self.ymin, 100) + plt.ylabel("% Identity") + + # Set plot titles + plt.title(self.refgenome) + plt.suptitle(self.microbiome) + + def draw(self, filename): + '''Creates a scatter plot of fragments after they're added in add_points() and save the plots in the specified file name''' + + # Sets points size and shape based on number of points recruited + if self.n_points > 10000: + pt_shape = 'o' + pt_size = 1 + else: + pt_shape ='s' + pt_size = 3 + + # Generate the scatter plot +# plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none') + + # Adds the legend on the plot. Currently, can only add one. Uncomment the plt.legend method. + plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none', label=self.samples[0]) + plt.legend(loc='lower left', ncol=3, scatterpoints=1, markerscale=8, prop={'size':10}) + + # Adds titles, axes, etc + self.decorate() + + # Save the images + plt.savefig(filename, dpi=150, bbox_inches=0) + + def legend(self, filename, width=4): + '''Create a legend for the plot''' + plt.figure(figsize=(width, 0.45*len(self.samples))) + plt.axis('off') + plt.ylim(-1,0.1) + + i = 0 + ystep = 0 + for s in self.samples: + plt.text(0.00, -1*ystep-0.1, 'text', + backgroundcolor=self.colors[i], color=self.colors[i]) + plt.text(0.27, -1*ystep-0.1, s) + i += 1 + ystep += 0.2 + + plt.title("Samples") + plt.savefig(filename, bbox_inches=0) + + def show(self): + self.decorate() + plt.show() + + def add_sample(self, name): + self.samples.append(name) + + def add_points(self, sample, ref_contig, coordinate, identity): + x= 0 + + # Builds the reference length + for c in self.contigs: + if (ref_contig == c[0]): + break + x += c[1] + + # Applies new color to different samples + clr = self.pt_color + for s in self.samples: + if(sample == s): + break + clr += 1 + + if x + coordinate >= self.begin and x + coordinate <= self.xextent: + self.n_points += 1 + self.x_points.append(x + coordinate) + self.y_points.append(identity) + self.clr_points.append(self.colors[clr]) + +def usage(): + print ('''Usage: + %s [options] + Options + -h: display this help message + -i : minimum identity to include in plot (y-axis extent) + -b : begin the plot at the nth nucleotide in the ref genome + -l : include n nucleotides in the x-axis of the plot + the default behavior is to plot the entirety of the genome + -t : title of the plot (e.g. name of reference genome). Put in quotes. + -m : Name of the microbiome. Put in quotes. + -c : number indicating color for plot points. + ''' % sys.argv[0]) + +if __name__ == '__main__': + # Parse all command-line arguments + # variable opts contains a list of the key-value pair of (option, value); variable args contains a list of the configuration file + try: + opts, args = getopt.getopt(sys.argv[1:], 'hi:b:l:t:m:c:', ['help', 'identity=', 'begin=', 'length=', 'title=', 'microbiome=', 'color=']) + except getopt.GetoptError as err: + print(str(err)) + usage() + sys.exit(2) + + # Set some default values + identity = 50.0 + begin = 0 + length = 0 + title = '' + microbiome='' + color = 0 + + # Parse options and set user-defined values + for opt, value in opts: + if opt in ('-h', '--help'): + usage() + sys.exit(0) + elif opt in ('-i', '--identity'): + identity = float(value) + elif opt in ('-b', '--begin'): + begin = int(value) + elif opt in ('-l', '--length'): + length = int(length) + elif opt in ('-t', '--title'): + title = str(value) + elif opt in ('-m', '--microbiome'): + microbiome = str(value) + elif opt in ('-c', '--color'): + color = int(value) + else: + assert False, "invalid option" + + # Verify there are input files: input and ref_genome + if len(args) == 0: + print ('Need input files.') + usage() + sys.exit + + # Load and read the configuration file. Treats the file as module object +# config = imp.load_source('*', args[0]) + + aligned_input= args[0] + ref_genome = args[1] + + # Instantiate plotting object + plotter = FragmentPlotter(microbiome, color, ymin=identity) + + + # Begin the plotting commands, starting each with the reference genome +# for refgenome in ref_genome: + '''refgenome = [(name, taxonID, file_path), ..., ]''' + contigs = [] + + # Parse the reference genome's FASTA file. + fasta = open(ref_genome, 'r') + for refseq in SequenceFile(fasta): + contigs.append([refseq.name.split('|')[3], len(refseq.seq)]) + + fasta.close() + + # Sets the ref genome's accession # as the title if title is not set (i.e. empty) + if title == '': + title = contigs[0][0] + + # Initialize the new plot + plotter.new_plot(title, contigs, begin, length) + + count_samp = 0 +# for sample in aligned_input: + '''sample = [(name, {taxonID:path}), ...,]''' +# plotter.add_sample("test") + count_samp += 1 + + # Call the appropriate parser based on sample file extension +# file = '%s%s' % (config.basepath, sample[1][refgenome[1]]) + +# if aligned_input.endswith('.sam'): +# parser_func = SAMFile +# elif aligned_input.endswith('.fr-hit'): +# parser_func = FRHITFile +# else: +# print('Invalid file type') + + parser_func = SAMFile + + sample_name ="" + for frag in parser_func(aligned_input): + if frag.identity >= identity: + sample_name = frag.name.split('.')[0] + plotter.add_points(frag.name, frag.refseq, frag.location, frag.identity) + + plotter.add_sample(sample_name) + # Put this inside the sample for-loop to create individual plot per sample + plotter.draw('%s.png' % ("frp_image")) + + +# plotter.show() + + # Draw the points and save file with taxonID as base name. Put this outside the sample for-loop to create an all-samples plot. + # Put this inside the sample for-loop to create individual plot per sample +# plotter.draw('%s/%s.png' % (config.shortname, refgenome[1])) + +# plotter.legend('%s/legend.png' % (config.shortname)) + + # Make a tarball of the resulting PNGs +# os.system('tar -cvf %s.tar %s >/dev/null' % +# ( config.shortname, config.shortname)) +# os.system('gzip %s.tar' % (config.shortname)) +# shutil.rmtree(config.shortname) + + + + + \ No newline at end of file