annotate tools/ilmn_pacbio/assembly_stats.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 #Copyright (c) 2011, Pacific Biosciences of California, Inc.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 #All rights reserved.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 #Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 # * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 # * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 # * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 #THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 #WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 #DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 #LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 #(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 import sys, os
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 from optparse import OptionParser
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 from galaxy import eggs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 import pkg_resources
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 pkg_resources.require( 'bx-python' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 from bx.seq.fasta import FastaReader
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 def getStats( fastaFile, genomeLength, minContigLength ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 lengths = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 stats = { "Num" : 0,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 "Sum" : 0,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 "Max" : 0,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 "Avg" : 0,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 "N50" : 0,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 "99%" : 0 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 fasta_reader = FastaReader( open( fastaFile, 'rb' ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 while True:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 seq = fasta_reader.next()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 if not seq:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 if seq.length < minContigLength:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 lengths.append( seq.length )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 if lengths:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 stats[ 'Num' ] = len( lengths )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 stats[ 'Sum' ] = sum( lengths )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 stats[ 'Max' ] = max( lengths )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 stats[ 'Avg' ] = int( sum( lengths ) / float( len( lengths ) ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 stats[ 'N50' ] = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 stats[ '99%' ] = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 if genomeLength == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 genomeLength = sum( lengths )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 lengths.sort()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 lengths.reverse()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 lenSum = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 stats[ "99%" ] = len( lengths )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 for idx, length in enumerate( lengths ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 lenSum += length
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 if ( lenSum > genomeLength / 2 ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 stats[ "N50" ] = length
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 lenSum = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 for idx, length in enumerate( lengths ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 lenSum += length
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 if lenSum > genomeLength * 0.99:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 stats[ "99%" ] = idx + 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 return stats
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 def __main__():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 #Parse Command Line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 usage = 'Usage: %prog input output --minContigLength'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 parser = OptionParser( usage=usage )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 parser.add_option( "--minContigLength", dest="minContigLength", help="Minimum length of contigs to analyze" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 parser.add_option( "--genomeLength", dest="genomeLength", help="Length of genome for which to calculate N50s" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 parser.set_defaults( minContigLength=0, genomeLength=0 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 options, args = parser.parse_args()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 input_fasta_file = args[ 0 ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 output_tabular_file = args[ 1 ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 statKeys = "Num Sum Max Avg N50 99%".split( " " )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 stats = getStats( input_fasta_file, int( options.genomeLength ), int( options.minContigLength ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 fout = open( output_tabular_file, "w" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 fout.write( "%s\n" % "\t".join( map( lambda key: str( stats[ key ] ), statKeys ) ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 fout.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 if __name__=="__main__": __main__()