riboseqr_wrapper: riboseqr/triplet.py comparison

comparison riboseqr/triplet.py @ 5:423ad61697c4

Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero). readingFrame function fails with subscript out of bounds. Bugfix 2: [triplet] Check if transcript name in SAM matches the name in FASTA. Produce an error message if it's not. This fixes the problem where an empty plot is produced (no bars). [ribosome_profile] - A proper error message is now produced if an invalid transcript name is provided. Updated test data

author	Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
date	Tue, 27 Oct 2015 12:21:50 +0000
parents	c34c364ce75d
children

comparison

equal deleted inserted replaced

-:b2eb07000039
+:423ad61697c4
 import utils
 rscript = ''
 R = robjects.r
+class TripletPeriodicityError(Exception):
+pass
 def run_rscript(command=None):
 """Run R command, log it, append to rscript"""
 global rscript
 return output
 def find_periodicity(
 rdata_load='Prepare.rda', start_codons='ATG', stop_codons='TAG,TAA,TGA',
-fasta_file=None, include_lengths='25:30', analyze_plot_lengths='26:30',
+fasta_file=None, include_lengths='25:30', analyze_plot_lengths=None,
 text_legend='Frame 0, Frame 1, Frame 2', rdata_save='Periodicity.rda',
 html_file='Periodicity-report.html', output_path=os.getcwd()):
 """Plot triplet periodicity from prepared R data file. """
 logging.debug('{}'.format(R('sessionInfo()')))
 cmd = 'suppressMessages(library(riboSeqR))'
 run_rscript(cmd)
 logging.debug('Loading saved R data file')
 cmd = 'load("{}")'.format(rdata_load)
 run_rscript(cmd)
+# get the first seqname (populated from input file which was generated from
+# SAM)
+refseq = R('levels(riboDat@riboGR$`RiboSeq file 1`@seqnames)[1]')
+refseq = tuple(refseq)[0]
+fasta_valid = False
+for line in open(fasta_file):
+if line.startswith('>'):
+seqname = line.strip().lstrip('>').split()
+# get the FASTA header, split on space, and use the first column to
+# compare. Multiple fields separated by space doesn't appear to be
+# a problem for riboSeqR
+if seqname[0] == refseq:
+fasta_valid = True
+break
+if not fasta_valid:
+raise TripletPeriodicityError(
+'Transcript name in SAM alignment does not match with name in the transcriptome '
+'FASTA header. Please also check if the reads were aligned to this transcriptome '
+'and not the genome')
 # R("""options(showTailLines=Inf)""")
 starts, stops = (utils.process_args(start_codons, ret_mode='charvector'),
 utils.process_args(stop_codons, ret_mode='charvector'))
 logging.debug('Potential coding sequences using start codon (ATG) and '
 'stop codons TAG, TAA, TGA')
 logging.debug('{}\n'.format(R['fastaCDS']))
 cmd = """fCs <- frameCounting(riboDat, fastaCDS, lengths={0})
-fS <- readingFrame(rC=fCs, lengths={1}); fS""".\
-format(include_lengths, analyze_plot_lengths)
+""".format(include_lengths)
+if analyze_plot_lengths:
+cmd += 'fS <- readingFrame(rC=fCs, lengths={0}); fS'.format(analyze_plot_lengths)
+else:
+cmd += 'fS <- readingFrame(rC=fCs); fS'
 run_rscript(cmd)
 logging.debug('riboDat \n{}\n'.format(R['riboDat']))
 logging.debug('fCs\n{0}\n'.format(R['fCs']))
 logging.debug('Reading frames for each n-mer\n{}'.format(R['fS']))
 with open(html_file, 'w') as f:
 f.write(html)
 if __name__ == '__main__':
 parser = argparse.ArgumentParser(
 description='Plot triplet periodicity for different read lengths.')
 # required arguments
 flags = parser.add_argument_group('required arguments')
 flags.add_argument(
 '--rdata_load', required=True,
 help='riboSeqR data from input preparation step (Prepare.rda)')
 flags.add_argument(
 '--fasta_file', required=True,
 help='FASTA file of the reference transcriptome')
 # optional arguments
 parser.add_argument(
 '--start_codons',
 help='Start codon(s) to use. (default: %(default)s)', default='ATG')
 parser.add_argument(
 '--include_lengths',
 help='Lengths of ribosome footprints to be included '
 '(default: %(default)s)', default='25:30')
 parser.add_argument(
 '--analyze_plot_lengths',
-help='Lenghts of reads to be analyzed for frame-shift and plotting '
+help='Lenghts of reads to be analyzed for frame-shift and plotting')
-'(default: %(default)s)', default='26:30')
 parser.add_argument(
 '--text_legend',
 help='Text for legend used in the plot (default: %(default)s)',
 default='Frame 0, Frame 1, Frame 2')
 parser.add_argument(
 parser.add_argument('--html_file', help='Output file for results (HTML)')
 parser.add_argument('--output_path',
 help='Files are saved in this directory')
 parser.add_argument(
 '--debug', help='Produce debug output', action='store_true')
 args = parser.parse_args()
 if args.debug:
 logging.basicConfig(format='%(module)s: %(levelname)s - %(message)s',
 level=logging.DEBUG, stream=sys.stdout)
 logging.debug('Supplied Arguments\n{}\n'.format(vars(args)))

Mercurial > repos > vimalkumarvelayudhan > riboseqr_wrapper

comparison riboseqr/triplet.py @ 5:423ad61697c4