comparison riboseqr/triplet.py @ 5:423ad61697c4

Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero). readingFrame function fails with subscript out of bounds. Bugfix 2: [triplet] Check if transcript name in SAM matches the name in FASTA. Produce an error message if it's not. This fixes the problem where an empty plot is produced (no bars). [ribosome_profile] - A proper error message is now produced if an invalid transcript name is provided. Updated test data
author Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
date Tue, 27 Oct 2015 12:21:50 +0000
parents c34c364ce75d
children
comparison
equal deleted inserted replaced
2:b2eb07000039 5:423ad61697c4
7 7
8 import utils 8 import utils
9 9
10 rscript = '' 10 rscript = ''
11 R = robjects.r 11 R = robjects.r
12
13
14 class TripletPeriodicityError(Exception):
15 pass
12 16
13 17
14 def run_rscript(command=None): 18 def run_rscript(command=None):
15 """Run R command, log it, append to rscript""" 19 """Run R command, log it, append to rscript"""
16 global rscript 20 global rscript
22 return output 26 return output
23 27
24 28
25 def find_periodicity( 29 def find_periodicity(
26 rdata_load='Prepare.rda', start_codons='ATG', stop_codons='TAG,TAA,TGA', 30 rdata_load='Prepare.rda', start_codons='ATG', stop_codons='TAG,TAA,TGA',
27 fasta_file=None, include_lengths='25:30', analyze_plot_lengths='26:30', 31 fasta_file=None, include_lengths='25:30', analyze_plot_lengths=None,
28 text_legend='Frame 0, Frame 1, Frame 2', rdata_save='Periodicity.rda', 32 text_legend='Frame 0, Frame 1, Frame 2', rdata_save='Periodicity.rda',
29 html_file='Periodicity-report.html', output_path=os.getcwd()): 33 html_file='Periodicity-report.html', output_path=os.getcwd()):
30 """Plot triplet periodicity from prepared R data file. """ 34 """Plot triplet periodicity from prepared R data file. """
31 logging.debug('{}'.format(R('sessionInfo()'))) 35 logging.debug('{}'.format(R('sessionInfo()')))
32 cmd = 'suppressMessages(library(riboSeqR))' 36 cmd = 'suppressMessages(library(riboSeqR))'
33 run_rscript(cmd) 37 run_rscript(cmd)
34 38
35 logging.debug('Loading saved R data file') 39 logging.debug('Loading saved R data file')
36 cmd = 'load("{}")'.format(rdata_load) 40 cmd = 'load("{}")'.format(rdata_load)
37 run_rscript(cmd) 41 run_rscript(cmd)
42
43 # get the first seqname (populated from input file which was generated from
44 # SAM)
45 refseq = R('levels(riboDat@riboGR$`RiboSeq file 1`@seqnames)[1]')
46 refseq = tuple(refseq)[0]
47
48 fasta_valid = False
49 for line in open(fasta_file):
50 if line.startswith('>'):
51 seqname = line.strip().lstrip('>').split()
52 # get the FASTA header, split on space, and use the first column to
53 # compare. Multiple fields separated by space doesn't appear to be
54 # a problem for riboSeqR
55 if seqname[0] == refseq:
56 fasta_valid = True
57 break
58 if not fasta_valid:
59 raise TripletPeriodicityError(
60 'Transcript name in SAM alignment does not match with name in the transcriptome '
61 'FASTA header. Please also check if the reads were aligned to this transcriptome '
62 'and not the genome')
38 63
39 # R("""options(showTailLines=Inf)""") 64 # R("""options(showTailLines=Inf)""")
40 starts, stops = (utils.process_args(start_codons, ret_mode='charvector'), 65 starts, stops = (utils.process_args(start_codons, ret_mode='charvector'),
41 utils.process_args(stop_codons, ret_mode='charvector')) 66 utils.process_args(stop_codons, ret_mode='charvector'))
42 67
47 logging.debug('Potential coding sequences using start codon (ATG) and ' 72 logging.debug('Potential coding sequences using start codon (ATG) and '
48 'stop codons TAG, TAA, TGA') 73 'stop codons TAG, TAA, TGA')
49 logging.debug('{}\n'.format(R['fastaCDS'])) 74 logging.debug('{}\n'.format(R['fastaCDS']))
50 75
51 cmd = """fCs <- frameCounting(riboDat, fastaCDS, lengths={0}) 76 cmd = """fCs <- frameCounting(riboDat, fastaCDS, lengths={0})
52 fS <- readingFrame(rC=fCs, lengths={1}); fS""".\ 77
53 format(include_lengths, analyze_plot_lengths) 78 """.format(include_lengths)
79 if analyze_plot_lengths:
80 cmd += 'fS <- readingFrame(rC=fCs, lengths={0}); fS'.format(analyze_plot_lengths)
81 else:
82 cmd += 'fS <- readingFrame(rC=fCs); fS'
54 run_rscript(cmd) 83 run_rscript(cmd)
55 84
56 logging.debug('riboDat \n{}\n'.format(R['riboDat'])) 85 logging.debug('riboDat \n{}\n'.format(R['riboDat']))
57 logging.debug('fCs\n{0}\n'.format(R['fCs'])) 86 logging.debug('fCs\n{0}\n'.format(R['fCs']))
58 logging.debug('Reading frames for each n-mer\n{}'.format(R['fS'])) 87 logging.debug('Reading frames for each n-mer\n{}'.format(R['fS']))
95 with open(html_file, 'w') as f: 124 with open(html_file, 'w') as f:
96 f.write(html) 125 f.write(html)
97 126
98 127
99 if __name__ == '__main__': 128 if __name__ == '__main__':
100 129
101 parser = argparse.ArgumentParser( 130 parser = argparse.ArgumentParser(
102 description='Plot triplet periodicity for different read lengths.') 131 description='Plot triplet periodicity for different read lengths.')
103 132
104 # required arguments 133 # required arguments
105 flags = parser.add_argument_group('required arguments') 134 flags = parser.add_argument_group('required arguments')
106 flags.add_argument( 135 flags.add_argument(
107 '--rdata_load', required=True, 136 '--rdata_load', required=True,
108 help='riboSeqR data from input preparation step (Prepare.rda)') 137 help='riboSeqR data from input preparation step (Prepare.rda)')
109 flags.add_argument( 138 flags.add_argument(
110 '--fasta_file', required=True, 139 '--fasta_file', required=True,
111 help='FASTA file of the reference transcriptome') 140 help='FASTA file of the reference transcriptome')
112 141
113 # optional arguments 142 # optional arguments
114 parser.add_argument( 143 parser.add_argument(
115 '--start_codons', 144 '--start_codons',
116 help='Start codon(s) to use. (default: %(default)s)', default='ATG') 145 help='Start codon(s) to use. (default: %(default)s)', default='ATG')
117 parser.add_argument( 146 parser.add_argument(
121 '--include_lengths', 150 '--include_lengths',
122 help='Lengths of ribosome footprints to be included ' 151 help='Lengths of ribosome footprints to be included '
123 '(default: %(default)s)', default='25:30') 152 '(default: %(default)s)', default='25:30')
124 parser.add_argument( 153 parser.add_argument(
125 '--analyze_plot_lengths', 154 '--analyze_plot_lengths',
126 help='Lenghts of reads to be analyzed for frame-shift and plotting ' 155 help='Lenghts of reads to be analyzed for frame-shift and plotting')
127 '(default: %(default)s)', default='26:30')
128 parser.add_argument( 156 parser.add_argument(
129 '--text_legend', 157 '--text_legend',
130 help='Text for legend used in the plot (default: %(default)s)', 158 help='Text for legend used in the plot (default: %(default)s)',
131 default='Frame 0, Frame 1, Frame 2') 159 default='Frame 0, Frame 1, Frame 2')
132 parser.add_argument( 160 parser.add_argument(
135 parser.add_argument('--html_file', help='Output file for results (HTML)') 163 parser.add_argument('--html_file', help='Output file for results (HTML)')
136 parser.add_argument('--output_path', 164 parser.add_argument('--output_path',
137 help='Files are saved in this directory') 165 help='Files are saved in this directory')
138 parser.add_argument( 166 parser.add_argument(
139 '--debug', help='Produce debug output', action='store_true') 167 '--debug', help='Produce debug output', action='store_true')
140 168
141 args = parser.parse_args() 169 args = parser.parse_args()
142 if args.debug: 170 if args.debug:
143 logging.basicConfig(format='%(module)s: %(levelname)s - %(message)s', 171 logging.basicConfig(format='%(module)s: %(levelname)s - %(message)s',
144 level=logging.DEBUG, stream=sys.stdout) 172 level=logging.DEBUG, stream=sys.stdout)
145 logging.debug('Supplied Arguments\n{}\n'.format(vars(args))) 173 logging.debug('Supplied Arguments\n{}\n'.format(vars(args)))