Mercurial > repos > vimalkumarvelayudhan > riboseqr_wrapper
annotate riboseqr/triplet.py @ 6:5a242f289347 default tip
Merge
| author | Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com> |
|---|---|
| date | Tue, 27 Oct 2015 12:29:39 +0000 |
| parents | 423ad61697c4 |
| children |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env python |
| 2 import os | |
| 3 import sys | |
| 4 import argparse | |
| 5 import logging | |
| 6 import rpy2.robjects as robjects | |
| 7 | |
| 8 import utils | |
| 9 | |
| 10 rscript = '' | |
| 11 R = robjects.r | |
| 12 | |
| 13 | |
|
5
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
14 class TripletPeriodicityError(Exception): |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
15 pass |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
16 |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
17 |
| 0 | 18 def run_rscript(command=None): |
| 19 """Run R command, log it, append to rscript""" | |
| 20 global rscript | |
| 21 if not command: | |
| 22 return | |
| 23 logging.debug(command) | |
| 24 rscript += '{}\n'.format(command) | |
| 25 output = R(command) | |
| 26 return output | |
| 27 | |
| 28 | |
| 29 def find_periodicity( | |
| 30 rdata_load='Prepare.rda', start_codons='ATG', stop_codons='TAG,TAA,TGA', | |
|
5
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
31 fasta_file=None, include_lengths='25:30', analyze_plot_lengths=None, |
| 0 | 32 text_legend='Frame 0, Frame 1, Frame 2', rdata_save='Periodicity.rda', |
| 33 html_file='Periodicity-report.html', output_path=os.getcwd()): | |
| 34 """Plot triplet periodicity from prepared R data file. """ | |
| 35 logging.debug('{}'.format(R('sessionInfo()'))) | |
| 36 cmd = 'suppressMessages(library(riboSeqR))' | |
| 37 run_rscript(cmd) | |
| 38 | |
| 39 logging.debug('Loading saved R data file') | |
| 40 cmd = 'load("{}")'.format(rdata_load) | |
| 41 run_rscript(cmd) | |
| 42 | |
|
5
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
43 # get the first seqname (populated from input file which was generated from |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
44 # SAM) |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
45 refseq = R('levels(riboDat@riboGR$`RiboSeq file 1`@seqnames)[1]') |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
46 refseq = tuple(refseq)[0] |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
47 |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
48 fasta_valid = False |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
49 for line in open(fasta_file): |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
50 if line.startswith('>'): |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
51 seqname = line.strip().lstrip('>').split() |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
52 # get the FASTA header, split on space, and use the first column to |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
53 # compare. Multiple fields separated by space doesn't appear to be |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
54 # a problem for riboSeqR |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
55 if seqname[0] == refseq: |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
56 fasta_valid = True |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
57 break |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
58 if not fasta_valid: |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
59 raise TripletPeriodicityError( |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
60 'Transcript name in SAM alignment does not match with name in the transcriptome ' |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
61 'FASTA header. Please also check if the reads were aligned to this transcriptome ' |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
62 'and not the genome') |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
63 |
| 0 | 64 # R("""options(showTailLines=Inf)""") |
| 65 starts, stops = (utils.process_args(start_codons, ret_mode='charvector'), | |
| 66 utils.process_args(stop_codons, ret_mode='charvector')) | |
| 67 | |
| 68 cmd = ('fastaCDS <- findCDS(fastaFile={0!r}, startCodon={1}, ' | |
| 69 'stopCodon={2})'.format(fasta_file, starts, stops)) | |
| 70 run_rscript(cmd) | |
| 71 | |
| 72 logging.debug('Potential coding sequences using start codon (ATG) and ' | |
| 73 'stop codons TAG, TAA, TGA') | |
| 74 logging.debug('{}\n'.format(R['fastaCDS'])) | |
| 75 | |
| 76 cmd = """fCs <- frameCounting(riboDat, fastaCDS, lengths={0}) | |
|
5
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
77 |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
78 """.format(include_lengths) |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
79 if analyze_plot_lengths: |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
80 cmd += 'fS <- readingFrame(rC=fCs, lengths={0}); fS'.format(analyze_plot_lengths) |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
81 else: |
|
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
82 cmd += 'fS <- readingFrame(rC=fCs); fS' |
| 0 | 83 run_rscript(cmd) |
| 84 | |
| 85 logging.debug('riboDat \n{}\n'.format(R['riboDat'])) | |
| 86 logging.debug('fCs\n{0}\n'.format(R['fCs'])) | |
| 87 logging.debug('Reading frames for each n-mer\n{}'.format(R['fS'])) | |
| 88 | |
| 89 legend = utils.process_args(text_legend, ret_mode='charvector') | |
| 90 | |
| 91 for fmat in ('pdf', 'png'): | |
| 92 if fmat == 'png': | |
| 93 cmd = '{0}(file="{1}", type="cairo")' | |
| 94 else: | |
| 95 cmd = '{0}(file="{1}")' | |
| 96 run_rscript(cmd.format(fmat, os.path.join( | |
| 97 output_path, '{0}.{1}'.format('Periodicity-plot', fmat)))) | |
| 98 run_rscript('plotFS(fS, legend.text = {0})'.format(legend)) | |
| 99 run_rscript('dev.off()') | |
| 100 | |
| 101 run_rscript('save("fCs", "fS", "riboDat", "fastaCDS", ' | |
| 102 'file="{}", compress=FALSE)'.format(rdata_save)) | |
| 103 | |
| 104 html = '<h2>Triplet periodicity - results</h2><hr>' | |
| 105 html += ('<h4>Results of reading frame analysis</h4>' | |
| 106 '<pre>{}</pre><br>'.format(R['fS'])) | |
| 107 html += ('<p>Lengths used for reading frame analysis - <code>{0}</code>' | |
| 108 '<br>Lengths selected for the plot - <code>{1}</code>' | |
| 109 '</p>'.format(include_lengths, analyze_plot_lengths)) | |
| 110 html += ('<p><img src="Periodicity-plot.png" border="1" ' | |
| 111 'alt="Triplet periodicity plot" />' | |
| 112 '<br><a href="Periodicity-plot.pdf">PDF version</a></p>') | |
| 113 | |
| 114 logging.debug('\n{:#^80}\n{}\n{:#^80}\n'.format( | |
| 115 ' R script for this session ', rscript, ' End R script ')) | |
| 116 | |
| 117 with open(os.path.join(output_path, 'periodicity.R'), 'w') as r: | |
| 118 r.write(rscript) | |
| 119 | |
| 120 html += ('<h4>R script for this session</h4>' | |
| 121 '<p><a href="periodicity.R">periodicity.R</a></p>' | |
| 122 '<p>Next step: <em>Metagene analysis</em></p>') | |
| 123 | |
| 124 with open(html_file, 'w') as f: | |
| 125 f.write(html) | |
| 126 | |
| 127 | |
| 128 if __name__ == '__main__': | |
|
5
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
129 |
| 0 | 130 parser = argparse.ArgumentParser( |
| 131 description='Plot triplet periodicity for different read lengths.') | |
|
5
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
132 |
| 0 | 133 # required arguments |
| 134 flags = parser.add_argument_group('required arguments') | |
| 135 flags.add_argument( | |
| 136 '--rdata_load', required=True, | |
| 137 help='riboSeqR data from input preparation step (Prepare.rda)') | |
| 138 flags.add_argument( | |
| 139 '--fasta_file', required=True, | |
| 140 help='FASTA file of the reference transcriptome') | |
|
5
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
141 |
| 0 | 142 # optional arguments |
| 143 parser.add_argument( | |
| 144 '--start_codons', | |
| 145 help='Start codon(s) to use. (default: %(default)s)', default='ATG') | |
| 146 parser.add_argument( | |
| 147 '--stop_codons', help='Stop codon(s) to use. (default: %(default)s)', | |
| 148 default='TAG, TAA, TGA') | |
| 149 parser.add_argument( | |
| 150 '--include_lengths', | |
| 151 help='Lengths of ribosome footprints to be included ' | |
| 152 '(default: %(default)s)', default='25:30') | |
| 153 parser.add_argument( | |
| 154 '--analyze_plot_lengths', | |
|
5
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
155 help='Lenghts of reads to be analyzed for frame-shift and plotting') |
| 0 | 156 parser.add_argument( |
| 157 '--text_legend', | |
| 158 help='Text for legend used in the plot (default: %(default)s)', | |
| 159 default='Frame 0, Frame 1, Frame 2') | |
| 160 parser.add_argument( | |
| 161 '--rdata_save', help='File to write RData to (default: %(default)s)', | |
| 162 default='Periodicity.rda') | |
| 163 parser.add_argument('--html_file', help='Output file for results (HTML)') | |
| 164 parser.add_argument('--output_path', | |
| 165 help='Files are saved in this directory') | |
| 166 parser.add_argument( | |
| 167 '--debug', help='Produce debug output', action='store_true') | |
|
5
423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
parents:
0
diff
changeset
|
168 |
| 0 | 169 args = parser.parse_args() |
| 170 if args.debug: | |
| 171 logging.basicConfig(format='%(module)s: %(levelname)s - %(message)s', | |
| 172 level=logging.DEBUG, stream=sys.stdout) | |
| 173 logging.debug('Supplied Arguments\n{}\n'.format(vars(args))) | |
| 174 | |
| 175 if not os.path.exists(args.output_path): | |
| 176 os.mkdir(args.output_path) | |
| 177 | |
| 178 find_periodicity( | |
| 179 rdata_load=args.rdata_load, start_codons=args.start_codons, | |
| 180 stop_codons=args.stop_codons, fasta_file=args.fasta_file, | |
| 181 include_lengths=args.include_lengths, | |
| 182 analyze_plot_lengths=args.analyze_plot_lengths, | |
| 183 text_legend=args.text_legend, | |
| 184 rdata_save=args.rdata_save, html_file=args.html_file, | |
| 185 output_path=args.output_path) | |
| 186 logging.debug("Done!") |
