| 
6
 | 
     1 #! /usr/bin/env python
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # Copyright INRA-URGI 2009-2010
 | 
| 
 | 
     4 # 
 | 
| 
 | 
     5 # This software is governed by the CeCILL license under French law and
 | 
| 
 | 
     6 # abiding by the rules of distribution of free software. You can use,
 | 
| 
 | 
     7 # modify and/ or redistribute the software under the terms of the CeCILL
 | 
| 
 | 
     8 # license as circulated by CEA, CNRS and INRIA at the following URL
 | 
| 
 | 
     9 # "http://www.cecill.info".
 | 
| 
 | 
    10 # 
 | 
| 
 | 
    11 # As a counterpart to the access to the source code and rights to copy,
 | 
| 
 | 
    12 # modify and redistribute granted by the license, users are provided only
 | 
| 
 | 
    13 # with a limited warranty and the software's author, the holder of the
 | 
| 
 | 
    14 # economic rights, and the successive licensors have only limited
 | 
| 
 | 
    15 # liability.
 | 
| 
 | 
    16 # 
 | 
| 
 | 
    17 # In this respect, the user's attention is drawn to the risks associated
 | 
| 
 | 
    18 # with loading, using, modifying and/or developing or reproducing the
 | 
| 
 | 
    19 # software by the user in light of its specific status of free software,
 | 
| 
 | 
    20 # that may mean that it is complicated to manipulate, and that also
 | 
| 
 | 
    21 # therefore means that it is reserved for developers and experienced
 | 
| 
 | 
    22 # professionals having in-depth computer knowledge. Users are therefore
 | 
| 
 | 
    23 # encouraged to load and test the software's suitability as regards their
 | 
| 
 | 
    24 # requirements in conditions enabling the security of their systems and/or
 | 
| 
 | 
    25 # data to be ensured and, more generally, to use and operate it in the
 | 
| 
 | 
    26 # same conditions as regards security.
 | 
| 
 | 
    27 # 
 | 
| 
 | 
    28 # The fact that you are presently reading this means that you have had
 | 
| 
 | 
    29 # knowledge of the CeCILL license and that you accept its terms.
 | 
| 
 | 
    30 #
 | 
| 
 | 
    31 """Get the size distribution of a Fasta / BED file"""
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 import os
 | 
| 
 | 
    34 from optparse import OptionParser
 | 
| 
 | 
    35 from commons.core.parsing.FastaParser import *
 | 
| 
 | 
    36 from SMART.Java.Python.misc.Progress import *
 | 
| 
 | 
    37 from SMART.Java.Python.misc.RPlotter import *
 | 
| 
 | 
    38 from commons.core.parsing.ParserChooser import ParserChooser
 | 
| 
 | 
    39 
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 def writeCVSfile(outHandler):
 | 
| 
 | 
    42     for pos in range(len(letters)):
 | 
| 
 | 
    43         posTrue = pos +1
 | 
| 
 | 
    44         outHandler.write( "%s;" % (posTrue))
 | 
| 
 | 
    45         for letter in lettersRate:
 | 
| 
 | 
    46             if positionRate[letter].has_key(pos):
 | 
| 
 | 
    47                 outHandler.write("%s=%.2f%s;" %(letter, positionRate[letter][pos], "%"))
 | 
| 
 | 
    48             else:
 | 
| 
 | 
    49                 outHandler.write("%s=0%s;" % (letter, "%"))
 | 
| 
 | 
    50         outHandler.write("\n")
 | 
| 
 | 
    51 
 | 
| 
 | 
    52 if __name__ == "__main__":
 | 
| 
 | 
    53 
 | 
| 
 | 
    54     # parse command line
 | 
| 
 | 
    55     description = "Get Letter Distribution v1.0.1: Compute the distribution of nucleotides of a set of genomic coordinates. [Category: Visualization]"
 | 
| 
 | 
    56 
 | 
| 
 | 
    57     parser = OptionParser(description = description)
 | 
| 
 | 
    58     parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                     type="string", help="input file to be analyzed [compulsory] [format: file in sequence format given by -f]")
 | 
| 
 | 
    59     parser.add_option("-f", "--format",    dest="format",         action="store",                     type="string", help="format of file [format: sequence file format]")
 | 
| 
 | 
    60     parser.add_option("-o", "--output",    dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in PNG format]")
 | 
| 
 | 
    61     parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
 | 
| 
 | 
    62     parser.add_option("-c", "--csv",       dest="csv",            action="store_true", default=False,                help="write a .csv file [format: bool] [default: false]")
 | 
| 
 | 
    63     parser.add_option("-l", "--log",       dest="log",            action="store_true", default=False,                help="write a log file [format: bool] [default: false]")
 | 
| 
 | 
    64     (options, args) = parser.parse_args()
 | 
| 
 | 
    65 
 | 
| 
 | 
    66     chooser = ParserChooser()
 | 
| 
 | 
    67     chooser.findFormat(options.format)    
 | 
| 
 | 
    68     parser      = chooser.getParser(options.inputFileName)
 | 
| 
 | 
    69     nbSequences = parser.getNbSequences()
 | 
| 
 | 
    70     print "%i sequences read" % (nbSequences)
 | 
| 
 | 
    71 
 | 
| 
 | 
    72     # treat items
 | 
| 
 | 
    73     progress       = Progress(nbSequences, "Analyzing sequences of " + options.inputFileName, options.verbosity)
 | 
| 
 | 
    74     nbLettersTotal = 0
 | 
| 
 | 
    75     nbLetters      = {}
 | 
| 
 | 
    76     lettersRate    = {}
 | 
| 
 | 
    77     nbPositions    = {}
 | 
| 
 | 
    78     positionCount  = {}
 | 
| 
 | 
    79     positionRate   = {}
 | 
| 
 | 
    80     nbPositionRate = {}
 | 
| 
 | 
    81     for sequence in parser.getIterator():
 | 
| 
 | 
    82         letters            = sequence.getSequence()
 | 
| 
 | 
    83         thisNbLettersTotal = sequence.getSize()
 | 
| 
 | 
    84         nbLettersTotal    += thisNbLettersTotal
 | 
| 
 | 
    85         thisNbLetters      = {}
 | 
| 
 | 
    86         
 | 
| 
 | 
    87         for pos in range(len(letters)):
 | 
| 
 | 
    88             letter = letters[pos]
 | 
| 
 | 
    89             if letter not in thisNbLetters:
 | 
| 
 | 
    90                 thisNbLetters[letter] = 1
 | 
| 
 | 
    91             else:
 | 
| 
 | 
    92                 thisNbLetters[letter] += 1
 | 
| 
 | 
    93             if pos+1 not in nbPositions:
 | 
| 
 | 
    94                 nbPositions[pos+1] = 1
 | 
| 
 | 
    95             else:
 | 
| 
 | 
    96                 nbPositions[pos+1] += 1
 | 
| 
 | 
    97             if letter not in positionCount:
 | 
| 
 | 
    98                 positionCount[letter] = {}
 | 
| 
 | 
    99             if pos+1 not in positionCount[letter]:
 | 
| 
 | 
   100                 positionCount[letter][pos+1] = 1
 | 
| 
 | 
   101             else:
 | 
| 
 | 
   102                 positionCount[letter][pos+1] += 1
 | 
| 
 | 
   103 
 | 
| 
 | 
   104         for letter in thisNbLetters:
 | 
| 
 | 
   105             if letter not in nbLetters:
 | 
| 
 | 
   106                 nbLetters[letter] = thisNbLetters[letter]
 | 
| 
 | 
   107             else:
 | 
| 
 | 
   108                 nbLetters[letter] += thisNbLetters[letter]
 | 
| 
 | 
   109             if letter not in lettersRate:
 | 
| 
 | 
   110                 lettersRate[letter] = {}
 | 
| 
 | 
   111             rate = int(float(thisNbLetters[letter]) / thisNbLettersTotal * 100)
 | 
| 
 | 
   112             if rate not in lettersRate[letter]:
 | 
| 
 | 
   113                 lettersRate[letter][rate] = 1
 | 
| 
 | 
   114             else:
 | 
| 
 | 
   115                 lettersRate[letter][rate] += 1
 | 
| 
 | 
   116         progress.inc()
 | 
| 
 | 
   117     progress.done()
 | 
| 
 | 
   118     
 | 
| 
 | 
   119     for letter in positionCount:
 | 
| 
 | 
   120         positionRate[letter] = {}
 | 
| 
 | 
   121         for pos in positionCount[letter]:
 | 
| 
 | 
   122             positionRate[letter][pos] = positionCount[letter][pos] / float(nbPositions[pos]) * 100
 | 
| 
 | 
   123     for pos in nbPositions:
 | 
| 
 | 
   124         nbPositionRate[pos] = nbPositions[pos] / float(nbPositions[1]) * 100
 | 
| 
 | 
   125 
 | 
| 
 | 
   126     # plot content distributions
 | 
| 
 | 
   127     plotter = RPlotter("%s.png" % (options.outputFileName), options.verbosity, True)
 | 
| 
 | 
   128     plotter.setFill(0)
 | 
| 
 | 
   129     plotter.setLegend(True)
 | 
| 
 | 
   130     for letter in lettersRate:
 | 
| 
 | 
   131         plotter.addLine(lettersRate[letter], letter)
 | 
| 
 | 
   132     plotter.plot()
 | 
| 
 | 
   133     
 | 
| 
 | 
   134     # plot distribution per position
 | 
| 
 | 
   135     plotter = RPlotter("%sPerNt.png" % (options.outputFileName), options.verbosity, True)
 | 
| 
 | 
   136     plotter.setFill(0)
 | 
| 
 | 
   137     plotter.setLegend(True)
 | 
| 
 | 
   138     plotter.setXLabel("Position on the read")
 | 
| 
 | 
   139     plotter.setYLabel("Percentage")
 | 
| 
 | 
   140     for letter in positionRate:
 | 
| 
 | 
   141         plotter.addLine(positionRate[letter], letter)
 | 
| 
 | 
   142     plotter.addLine(nbPositionRate, "#")
 | 
| 
 | 
   143     plotter.plot()
 | 
| 
 | 
   144 
 | 
| 
 | 
   145     if options.csv:
 | 
| 
 | 
   146         outHandler = open("%s.csv" % (options.outputFileName), "w")
 | 
| 
 | 
   147         writeCVSfile(outHandler)
 | 
| 
 | 
   148         outHandler.close() 
 | 
| 
 | 
   149  
 | 
| 
 | 
   150     print "%d sequences" % (nbSequences)
 | 
| 
 | 
   151     print "%d letters" % (nbLettersTotal)
 | 
| 
 | 
   152     for letter in nbLetters:
 | 
| 
 | 
   153         print "%s: %d (%.2f%%)" % (letter, nbLetters[letter], float(nbLetters[letter]) / nbLettersTotal * 100)
 |