0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import os, sys, math, tempfile, zipfile, re
|
|
4 from rpy import *
|
|
5
|
|
6 assert sys.version_info[:2] >= ( 2, 4 )
|
|
7
|
|
8 def stop_err( msg ):
|
|
9 sys.stderr.write( "%s\n" % msg )
|
|
10 sys.exit()
|
|
11
|
|
12 def unzip( filename ):
|
|
13 zip_file = zipfile.ZipFile( filename, 'r' )
|
|
14 tmpfilename = tempfile.NamedTemporaryFile().name
|
|
15 for name in zip_file.namelist():
|
|
16 file( tmpfilename, 'a' ).write( zip_file.read( name ) )
|
|
17 zip_file.close()
|
|
18 return tmpfilename
|
|
19
|
|
20 def __main__():
|
|
21 infile_score_name = sys.argv[1].strip()
|
|
22 outfile_R_name = sys.argv[2].strip()
|
|
23
|
|
24 try:
|
|
25 score_threshold = int( sys.argv[3].strip() )
|
|
26 except:
|
|
27 stop_err( 'Threshold for quality score must be numerical.' )
|
|
28
|
|
29 infile_is_zipped = False
|
|
30 if zipfile.is_zipfile( infile_score_name ):
|
|
31 infile_is_zipped = True
|
|
32 infile_name = unzip( infile_score_name )
|
|
33 else:
|
|
34 infile_name = infile_score_name
|
|
35
|
|
36 # detect whether it's tabular or fasta format
|
|
37 seq_method = None
|
|
38 data_type = None
|
|
39 for i, line in enumerate( file( infile_name ) ):
|
|
40 line = line.rstrip( '\r\n' )
|
|
41 if not line or line.startswith( '#' ):
|
|
42 continue
|
|
43 if data_type == None:
|
|
44 if line.startswith( '>' ):
|
|
45 data_type = 'fasta'
|
|
46 continue
|
|
47 elif len( line.split( '\t' ) ) > 0:
|
|
48 fields = line.split()
|
|
49 for score in fields:
|
|
50 try:
|
|
51 int( score )
|
|
52 data_type = 'tabular'
|
|
53 seq_method = 'solexa'
|
|
54 break
|
|
55 except:
|
|
56 break
|
|
57 elif data_type == 'fasta':
|
|
58 fields = line.split()
|
|
59 for score in fields:
|
|
60 try:
|
|
61 int( score )
|
|
62 seq_method = '454'
|
|
63 break
|
|
64 except:
|
|
65 break
|
|
66 if i == 100:
|
|
67 break
|
|
68
|
|
69 if data_type is None:
|
|
70 stop_err( 'This tool can only use fasta data or tabular data.' )
|
|
71 if seq_method is None:
|
|
72 stop_err( 'Invalid data for fasta format.')
|
|
73
|
|
74 cont_high_quality = []
|
|
75 invalid_lines = 0
|
|
76 invalid_scores = 0
|
|
77 if seq_method == 'solexa':
|
|
78 for i, line in enumerate( open( infile_name ) ):
|
|
79 line = line.rstrip( '\r\n' )
|
|
80 if not line or line.startswith( '#' ):
|
|
81 continue
|
|
82 locs = line.split( '\t' )
|
|
83 for j, base in enumerate( locs ):
|
|
84 nuc_errors = base.split()
|
|
85 try:
|
|
86 nuc_errors[0] = int( nuc_errors[0] )
|
|
87 nuc_errors[1] = int( nuc_errors[1] )
|
|
88 nuc_errors[2] = int( nuc_errors[2] )
|
|
89 nuc_errors[3] = int( nuc_errors[3] )
|
|
90 big = max( nuc_errors )
|
|
91 except:
|
|
92 invalid_scores += 1
|
|
93 big = 0
|
|
94 if j == 0:
|
|
95 cont_high_quality.append(1)
|
|
96 else:
|
|
97 if big >= score_threshold:
|
|
98 cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
|
|
99 else:
|
|
100 cont_high_quality.append(1)
|
|
101 else: # seq_method == '454'
|
|
102 tmp_score = ''
|
|
103 for i, line in enumerate( open( infile_name ) ):
|
|
104 line = line.rstrip( '\r\n' )
|
|
105 if not line or line.startswith( '#' ):
|
|
106 continue
|
|
107 if line.startswith( '>' ):
|
|
108 if len( tmp_score ) > 0:
|
|
109 locs = tmp_score.split()
|
|
110 for j, base in enumerate( locs ):
|
|
111 try:
|
|
112 base = int( base )
|
|
113 except:
|
|
114 invalid_scores += 1
|
|
115 base = 0
|
|
116 if j == 0:
|
|
117 cont_high_quality.append(1)
|
|
118 else:
|
|
119 if base >= score_threshold:
|
|
120 cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
|
|
121 else:
|
|
122 cont_high_quality.append(1)
|
|
123 tmp_score = ''
|
|
124 else:
|
|
125 tmp_score = "%s %s" % ( tmp_score, line )
|
|
126 if len( tmp_score ) > 0:
|
|
127 locs = tmp_score.split()
|
|
128 for j, base in enumerate( locs ):
|
|
129 try:
|
|
130 base = int( base )
|
|
131 except:
|
|
132 invalid_scores += 1
|
|
133 base = 0
|
|
134 if j == 0:
|
|
135 cont_high_quality.append(1)
|
|
136 else:
|
|
137 if base >= score_threshold:
|
|
138 cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
|
|
139 else:
|
|
140 cont_high_quality.append(1)
|
|
141
|
|
142 # generate pdf figures
|
|
143 cont_high_quality = array ( cont_high_quality )
|
|
144 outfile_R_pdf = outfile_R_name
|
|
145 r.pdf( outfile_R_pdf )
|
|
146 title = "Histogram of continuous high quality scores"
|
|
147 xlim_range = [ 1, max( cont_high_quality ) ]
|
|
148 nclass = max( cont_high_quality )
|
|
149 if nclass > 100:
|
|
150 nclass = 100
|
|
151 r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
|
|
152 r.dev_off()
|
|
153
|
|
154 if infile_is_zipped and os.path.exists( infile_name ):
|
|
155 # Need to delete temporary file created when we unzipped the infile archive
|
|
156 os.remove( infile_name )
|
|
157
|
|
158 if invalid_lines > 0:
|
|
159 print 'Skipped %d invalid lines. ' % invalid_lines
|
|
160 if invalid_scores > 0:
|
|
161 print 'Skipped %d invalid scores. ' % invalid_scores
|
|
162
|
|
163 r.quit( save="no" )
|
|
164
|
|
165 if __name__=="__main__":__main__() |