Mercurial > repos > schang > frp_tool
comparison frp_tool.py @ 0:e223b827a13e draft
Uploaded
| author | schang |
|---|---|
| date | Tue, 02 Dec 2014 19:38:14 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e223b827a13e |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 ''' | |
| 4 Author: Silvia Chang | |
| 5 Metagenomic Final Project | |
| 6 Fall 2014 | |
| 7 | |
| 8 Script to create fragment recruitment plots, after read-reference alignment using Bowtie (SAM output) and FR-HIT (fr-hit output) programs | |
| 9 Simplified/revised version of the fragrecruit project from https://github.com/mstrosaker/fragrecruit | |
| 10 | |
| 11 Usage: ./frp_tool.py [options] --input --output | |
| 12 | |
| 13 ''' | |
| 14 import matplotlib.pyplot as plt | |
| 15 import os, sys, getopt, imp, errno | |
| 16 # import shutil | |
| 17 from refseq_parser import SequenceFile | |
| 18 from SAM_parser import SAMFile | |
| 19 from FRHIT_parser import FRHITFile | |
| 20 | |
| 21 class FragmentPlotter: | |
| 22 '''Contains all functions to create fragment recruitment plots''' | |
| 23 | |
| 24 # Color list. Can only handle 5 samples. Add more colors for more samples. | |
| 25 colors = [ | |
| 26 (0.78, 0.59, 0.17, 1.0), #dark gold | |
| 27 (0.99, 0.60, 0.17, 1.0), #light orange | |
| 28 (0.60, 0.82, 0.47, 1.0), #medium/light green | |
| 29 (0.95, 0.40, 0.40, 1.0), #dark pink/fuschia | |
| 30 (0.17, 0.50, 0.69, 1.0), #medium teal/blue | |
| 31 (0.85, 0.64, 0.60, 1.0), #light plum | |
| 32 ] | |
| 33 | |
| 34 def __init__(self, microbiome, pt_color, ymin=50): | |
| 35 self.microbiome = microbiome | |
| 36 self.decorated = False | |
| 37 self.pt_color = pt_color | |
| 38 self.ymin = ymin | |
| 39 | |
| 40 def new_plot(self, ref_name, contigs, begin, length): | |
| 41 ''' Clears previous data and start a blank plot''' | |
| 42 # refgenome - string, organism name of the refgenome | |
| 43 # contigs - a list containing a [name,length] pair of the refgenome | |
| 44 # begin, length - integers indicate beginning/length of plot | |
| 45 | |
| 46 plt.close() | |
| 47 # Initialize variables that will contain all the fragments information | |
| 48 self.refgenome = ref_name | |
| 49 self.contigs = contigs | |
| 50 self.begin = begin | |
| 51 self.samples = [] | |
| 52 self.n_points = 0 | |
| 53 self.x_points = [] | |
| 54 self.y_points = [] | |
| 55 self.clr_points = [] | |
| 56 self.decorated = False | |
| 57 | |
| 58 self.full_length = 0 | |
| 59 | |
| 60 # Builds the full length of the reference genome, or partial based on begin and length variables | |
| 61 for contig in self.contigs: | |
| 62 self.full_length += contig[1] | |
| 63 | |
| 64 # Sets the range of the x axis | |
| 65 if length != 0: | |
| 66 self.xextent = begin + length | |
| 67 else: | |
| 68 self.xextent = self.full_length | |
| 69 | |
| 70 def decorate(self): | |
| 71 ''' Decorates the plot by adding axes, axes labels, title, tickmarks, etc.''' | |
| 72 if not self.decorated: | |
| 73 self.decorated = True | |
| 74 | |
| 75 # Sets the rc parameters, add x-axis tickmarks and labels them | |
| 76 plt.rc('xtick', labelsize=8) | |
| 77 ticks = [self.begin] # beginning tick | |
| 78 ticks.append(self.xextent/2) # mid tick | |
| 79 ticks.append(self.xextent) # last tick | |
| 80 | |
| 81 labels = [int(self.begin)] # beginning nucleotide | |
| 82 labels.append(str(self.xextent/2)) # mid nucleotide | |
| 83 labels.append(str(self.xextent)) # last nucleotide | |
| 84 | |
| 85 plt.xticks(ticks, labels) | |
| 86 | |
| 87 # Sets x and y axes boundaries and labels | |
| 88 plt.xlim(self.begin, self.xextent) | |
| 89 plt.ylim(self.ymin, 100) | |
| 90 plt.ylabel("% Identity") | |
| 91 | |
| 92 # Set plot titles | |
| 93 plt.title(self.refgenome) | |
| 94 plt.suptitle(self.microbiome) | |
| 95 | |
| 96 def draw(self, filename): | |
| 97 '''Creates a scatter plot of fragments after they're added in add_points() and save the plots in the specified file name''' | |
| 98 | |
| 99 # Sets points size and shape based on number of points recruited | |
| 100 if self.n_points > 10000: | |
| 101 pt_shape = 'o' | |
| 102 pt_size = 1 | |
| 103 else: | |
| 104 pt_shape ='s' | |
| 105 pt_size = 3 | |
| 106 | |
| 107 # Generate the scatter plot | |
| 108 # plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none') | |
| 109 | |
| 110 # Adds the legend on the plot. Currently, can only add one. Uncomment the plt.legend method. | |
| 111 plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none', label=self.samples[0]) | |
| 112 plt.legend(loc='lower left', ncol=3, scatterpoints=1, markerscale=8, prop={'size':10}) | |
| 113 | |
| 114 # Adds titles, axes, etc | |
| 115 self.decorate() | |
| 116 | |
| 117 # Save the images | |
| 118 plt.savefig(filename, dpi=150, bbox_inches=0) | |
| 119 | |
| 120 def legend(self, filename, width=4): | |
| 121 '''Create a legend for the plot''' | |
| 122 plt.figure(figsize=(width, 0.45*len(self.samples))) | |
| 123 plt.axis('off') | |
| 124 plt.ylim(-1,0.1) | |
| 125 | |
| 126 i = 0 | |
| 127 ystep = 0 | |
| 128 for s in self.samples: | |
| 129 plt.text(0.00, -1*ystep-0.1, 'text', | |
| 130 backgroundcolor=self.colors[i], color=self.colors[i]) | |
| 131 plt.text(0.27, -1*ystep-0.1, s) | |
| 132 i += 1 | |
| 133 ystep += 0.2 | |
| 134 | |
| 135 plt.title("Samples") | |
| 136 plt.savefig(filename, bbox_inches=0) | |
| 137 | |
| 138 def show(self): | |
| 139 self.decorate() | |
| 140 plt.show() | |
| 141 | |
| 142 def add_sample(self, name): | |
| 143 self.samples.append(name) | |
| 144 | |
| 145 def add_points(self, sample, ref_contig, coordinate, identity): | |
| 146 x= 0 | |
| 147 | |
| 148 # Builds the reference length | |
| 149 for c in self.contigs: | |
| 150 if (ref_contig == c[0]): | |
| 151 break | |
| 152 x += c[1] | |
| 153 | |
| 154 # Applies new color to different samples | |
| 155 clr = self.pt_color | |
| 156 for s in self.samples: | |
| 157 if(sample == s): | |
| 158 break | |
| 159 clr += 1 | |
| 160 | |
| 161 if x + coordinate >= self.begin and x + coordinate <= self.xextent: | |
| 162 self.n_points += 1 | |
| 163 self.x_points.append(x + coordinate) | |
| 164 self.y_points.append(identity) | |
| 165 self.clr_points.append(self.colors[clr]) | |
| 166 | |
| 167 def usage(): | |
| 168 print ('''Usage: | |
| 169 %s [options] <aligned_input> <ref_genome> | |
| 170 Options | |
| 171 -h: display this help message | |
| 172 -i <int>: minimum identity to include in plot (y-axis extent) | |
| 173 -b <int>: begin the plot at the nth nucleotide in the ref genome | |
| 174 -l <int>: include n nucleotides in the x-axis of the plot | |
| 175 the default behavior is to plot the entirety of the genome | |
| 176 -t <str>: title of the plot (e.g. name of reference genome). Put in quotes. | |
| 177 -m <str>: Name of the microbiome. Put in quotes. | |
| 178 -c <int>: number indicating color for plot points. | |
| 179 ''' % sys.argv[0]) | |
| 180 | |
| 181 if __name__ == '__main__': | |
| 182 # Parse all command-line arguments | |
| 183 # variable opts contains a list of the key-value pair of (option, value); variable args contains a list of the configuration file | |
| 184 try: | |
| 185 opts, args = getopt.getopt(sys.argv[1:], 'hi:b:l:t:m:c:', ['help', 'identity=', 'begin=', 'length=', 'title=', 'microbiome=', 'color=']) | |
| 186 except getopt.GetoptError as err: | |
| 187 print(str(err)) | |
| 188 usage() | |
| 189 sys.exit(2) | |
| 190 | |
| 191 # Set some default values | |
| 192 identity = 50.0 | |
| 193 begin = 0 | |
| 194 length = 0 | |
| 195 title = '' | |
| 196 microbiome='' | |
| 197 color = 0 | |
| 198 | |
| 199 # Parse options and set user-defined values | |
| 200 for opt, value in opts: | |
| 201 if opt in ('-h', '--help'): | |
| 202 usage() | |
| 203 sys.exit(0) | |
| 204 elif opt in ('-i', '--identity'): | |
| 205 identity = float(value) | |
| 206 elif opt in ('-b', '--begin'): | |
| 207 begin = int(value) | |
| 208 elif opt in ('-l', '--length'): | |
| 209 length = int(length) | |
| 210 elif opt in ('-t', '--title'): | |
| 211 title = str(value) | |
| 212 elif opt in ('-m', '--microbiome'): | |
| 213 microbiome = str(value) | |
| 214 elif opt in ('-c', '--color'): | |
| 215 color = int(value) | |
| 216 else: | |
| 217 assert False, "invalid option" | |
| 218 | |
| 219 # Verify there are input files: input and ref_genome | |
| 220 if len(args) == 0: | |
| 221 print ('Need input files.') | |
| 222 usage() | |
| 223 sys.exit | |
| 224 | |
| 225 # Load and read the configuration file. Treats the file as module object | |
| 226 # config = imp.load_source('*', args[0]) | |
| 227 | |
| 228 aligned_input= args[0] | |
| 229 ref_genome = args[1] | |
| 230 | |
| 231 # Instantiate plotting object | |
| 232 plotter = FragmentPlotter(microbiome, color, ymin=identity) | |
| 233 | |
| 234 | |
| 235 # Begin the plotting commands, starting each with the reference genome | |
| 236 # for refgenome in ref_genome: | |
| 237 '''refgenome = [(name, taxonID, file_path), ..., ]''' | |
| 238 contigs = [] | |
| 239 | |
| 240 # Parse the reference genome's FASTA file. | |
| 241 fasta = open(ref_genome, 'r') | |
| 242 for refseq in SequenceFile(fasta): | |
| 243 contigs.append([refseq.name.split('|')[3], len(refseq.seq)]) | |
| 244 | |
| 245 fasta.close() | |
| 246 | |
| 247 # Sets the ref genome's accession # as the title if title is not set (i.e. empty) | |
| 248 if title == '': | |
| 249 title = contigs[0][0] | |
| 250 | |
| 251 # Initialize the new plot | |
| 252 plotter.new_plot(title, contigs, begin, length) | |
| 253 | |
| 254 count_samp = 0 | |
| 255 # for sample in aligned_input: | |
| 256 '''sample = [(name, {taxonID:path}), ...,]''' | |
| 257 # plotter.add_sample("test") | |
| 258 count_samp += 1 | |
| 259 | |
| 260 # Call the appropriate parser based on sample file extension | |
| 261 # file = '%s%s' % (config.basepath, sample[1][refgenome[1]]) | |
| 262 | |
| 263 # if aligned_input.endswith('.sam'): | |
| 264 # parser_func = SAMFile | |
| 265 # elif aligned_input.endswith('.fr-hit'): | |
| 266 # parser_func = FRHITFile | |
| 267 # else: | |
| 268 # print('Invalid file type') | |
| 269 | |
| 270 parser_func = SAMFile | |
| 271 | |
| 272 sample_name ="" | |
| 273 for frag in parser_func(aligned_input): | |
| 274 if frag.identity >= identity: | |
| 275 sample_name = frag.name.split('.')[0] | |
| 276 plotter.add_points(frag.name, frag.refseq, frag.location, frag.identity) | |
| 277 | |
| 278 plotter.add_sample(sample_name) | |
| 279 # Put this inside the sample for-loop to create individual plot per sample | |
| 280 plotter.draw('%s.png' % ("frp_image")) | |
| 281 | |
| 282 | |
| 283 # plotter.show() | |
| 284 | |
| 285 # Draw the points and save file with taxonID as base name. Put this outside the sample for-loop to create an all-samples plot. | |
| 286 # Put this inside the sample for-loop to create individual plot per sample | |
| 287 # plotter.draw('%s/%s.png' % (config.shortname, refgenome[1])) | |
| 288 | |
| 289 # plotter.legend('%s/legend.png' % (config.shortname)) | |
| 290 | |
| 291 # Make a tarball of the resulting PNGs | |
| 292 # os.system('tar -cvf %s.tar %s >/dev/null' % | |
| 293 # ( config.shortname, config.shortname)) | |
| 294 # os.system('gzip %s.tar' % (config.shortname)) | |
| 295 # shutil.rmtree(config.shortname) | |
| 296 | |
| 297 | |
| 298 | |
| 299 | |
| 300 |
