Mercurial > repos > schang > frp_tool
view frp_tool.py @ 0:e223b827a13e draft
Uploaded
author | schang |
---|---|
date | Tue, 02 Dec 2014 19:38:14 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python ''' Author: Silvia Chang Metagenomic Final Project Fall 2014 Script to create fragment recruitment plots, after read-reference alignment using Bowtie (SAM output) and FR-HIT (fr-hit output) programs Simplified/revised version of the fragrecruit project from https://github.com/mstrosaker/fragrecruit Usage: ./frp_tool.py [options] --input --output ''' import matplotlib.pyplot as plt import os, sys, getopt, imp, errno # import shutil from refseq_parser import SequenceFile from SAM_parser import SAMFile from FRHIT_parser import FRHITFile class FragmentPlotter: '''Contains all functions to create fragment recruitment plots''' # Color list. Can only handle 5 samples. Add more colors for more samples. colors = [ (0.78, 0.59, 0.17, 1.0), #dark gold (0.99, 0.60, 0.17, 1.0), #light orange (0.60, 0.82, 0.47, 1.0), #medium/light green (0.95, 0.40, 0.40, 1.0), #dark pink/fuschia (0.17, 0.50, 0.69, 1.0), #medium teal/blue (0.85, 0.64, 0.60, 1.0), #light plum ] def __init__(self, microbiome, pt_color, ymin=50): self.microbiome = microbiome self.decorated = False self.pt_color = pt_color self.ymin = ymin def new_plot(self, ref_name, contigs, begin, length): ''' Clears previous data and start a blank plot''' # refgenome - string, organism name of the refgenome # contigs - a list containing a [name,length] pair of the refgenome # begin, length - integers indicate beginning/length of plot plt.close() # Initialize variables that will contain all the fragments information self.refgenome = ref_name self.contigs = contigs self.begin = begin self.samples = [] self.n_points = 0 self.x_points = [] self.y_points = [] self.clr_points = [] self.decorated = False self.full_length = 0 # Builds the full length of the reference genome, or partial based on begin and length variables for contig in self.contigs: self.full_length += contig[1] # Sets the range of the x axis if length != 0: self.xextent = begin + length else: self.xextent = self.full_length def decorate(self): ''' Decorates the plot by adding axes, axes labels, title, tickmarks, etc.''' if not self.decorated: self.decorated = True # Sets the rc parameters, add x-axis tickmarks and labels them plt.rc('xtick', labelsize=8) ticks = [self.begin] # beginning tick ticks.append(self.xextent/2) # mid tick ticks.append(self.xextent) # last tick labels = [int(self.begin)] # beginning nucleotide labels.append(str(self.xextent/2)) # mid nucleotide labels.append(str(self.xextent)) # last nucleotide plt.xticks(ticks, labels) # Sets x and y axes boundaries and labels plt.xlim(self.begin, self.xextent) plt.ylim(self.ymin, 100) plt.ylabel("% Identity") # Set plot titles plt.title(self.refgenome) plt.suptitle(self.microbiome) def draw(self, filename): '''Creates a scatter plot of fragments after they're added in add_points() and save the plots in the specified file name''' # Sets points size and shape based on number of points recruited if self.n_points > 10000: pt_shape = 'o' pt_size = 1 else: pt_shape ='s' pt_size = 3 # Generate the scatter plot # plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none') # Adds the legend on the plot. Currently, can only add one. Uncomment the plt.legend method. plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none', label=self.samples[0]) plt.legend(loc='lower left', ncol=3, scatterpoints=1, markerscale=8, prop={'size':10}) # Adds titles, axes, etc self.decorate() # Save the images plt.savefig(filename, dpi=150, bbox_inches=0) def legend(self, filename, width=4): '''Create a legend for the plot''' plt.figure(figsize=(width, 0.45*len(self.samples))) plt.axis('off') plt.ylim(-1,0.1) i = 0 ystep = 0 for s in self.samples: plt.text(0.00, -1*ystep-0.1, 'text', backgroundcolor=self.colors[i], color=self.colors[i]) plt.text(0.27, -1*ystep-0.1, s) i += 1 ystep += 0.2 plt.title("Samples") plt.savefig(filename, bbox_inches=0) def show(self): self.decorate() plt.show() def add_sample(self, name): self.samples.append(name) def add_points(self, sample, ref_contig, coordinate, identity): x= 0 # Builds the reference length for c in self.contigs: if (ref_contig == c[0]): break x += c[1] # Applies new color to different samples clr = self.pt_color for s in self.samples: if(sample == s): break clr += 1 if x + coordinate >= self.begin and x + coordinate <= self.xextent: self.n_points += 1 self.x_points.append(x + coordinate) self.y_points.append(identity) self.clr_points.append(self.colors[clr]) def usage(): print ('''Usage: %s [options] <aligned_input> <ref_genome> Options -h: display this help message -i <int>: minimum identity to include in plot (y-axis extent) -b <int>: begin the plot at the nth nucleotide in the ref genome -l <int>: include n nucleotides in the x-axis of the plot the default behavior is to plot the entirety of the genome -t <str>: title of the plot (e.g. name of reference genome). Put in quotes. -m <str>: Name of the microbiome. Put in quotes. -c <int>: number indicating color for plot points. ''' % sys.argv[0]) if __name__ == '__main__': # Parse all command-line arguments # variable opts contains a list of the key-value pair of (option, value); variable args contains a list of the configuration file try: opts, args = getopt.getopt(sys.argv[1:], 'hi:b:l:t:m:c:', ['help', 'identity=', 'begin=', 'length=', 'title=', 'microbiome=', 'color=']) except getopt.GetoptError as err: print(str(err)) usage() sys.exit(2) # Set some default values identity = 50.0 begin = 0 length = 0 title = '' microbiome='' color = 0 # Parse options and set user-defined values for opt, value in opts: if opt in ('-h', '--help'): usage() sys.exit(0) elif opt in ('-i', '--identity'): identity = float(value) elif opt in ('-b', '--begin'): begin = int(value) elif opt in ('-l', '--length'): length = int(length) elif opt in ('-t', '--title'): title = str(value) elif opt in ('-m', '--microbiome'): microbiome = str(value) elif opt in ('-c', '--color'): color = int(value) else: assert False, "invalid option" # Verify there are input files: input and ref_genome if len(args) == 0: print ('Need input files.') usage() sys.exit # Load and read the configuration file. Treats the file as module object # config = imp.load_source('*', args[0]) aligned_input= args[0] ref_genome = args[1] # Instantiate plotting object plotter = FragmentPlotter(microbiome, color, ymin=identity) # Begin the plotting commands, starting each with the reference genome # for refgenome in ref_genome: '''refgenome = [(name, taxonID, file_path), ..., ]''' contigs = [] # Parse the reference genome's FASTA file. fasta = open(ref_genome, 'r') for refseq in SequenceFile(fasta): contigs.append([refseq.name.split('|')[3], len(refseq.seq)]) fasta.close() # Sets the ref genome's accession # as the title if title is not set (i.e. empty) if title == '': title = contigs[0][0] # Initialize the new plot plotter.new_plot(title, contigs, begin, length) count_samp = 0 # for sample in aligned_input: '''sample = [(name, {taxonID:path}), ...,]''' # plotter.add_sample("test") count_samp += 1 # Call the appropriate parser based on sample file extension # file = '%s%s' % (config.basepath, sample[1][refgenome[1]]) # if aligned_input.endswith('.sam'): # parser_func = SAMFile # elif aligned_input.endswith('.fr-hit'): # parser_func = FRHITFile # else: # print('Invalid file type') parser_func = SAMFile sample_name ="" for frag in parser_func(aligned_input): if frag.identity >= identity: sample_name = frag.name.split('.')[0] plotter.add_points(frag.name, frag.refseq, frag.location, frag.identity) plotter.add_sample(sample_name) # Put this inside the sample for-loop to create individual plot per sample plotter.draw('%s.png' % ("frp_image")) # plotter.show() # Draw the points and save file with taxonID as base name. Put this outside the sample for-loop to create an all-samples plot. # Put this inside the sample for-loop to create individual plot per sample # plotter.draw('%s/%s.png' % (config.shortname, refgenome[1])) # plotter.legend('%s/legend.png' % (config.shortname)) # Make a tarball of the resulting PNGs # os.system('tar -cvf %s.tar %s >/dev/null' % # ( config.shortname, config.shortname)) # os.system('gzip %s.tar' % (config.shortname)) # shutil.rmtree(config.shortname)