comparison short_reads_figure_high_quality_length.py @ 0:556ceed24699 draft

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:34:37 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:556ceed24699
1 #!/usr/bin/env python
2
3 import os, sys, math, tempfile, zipfile, re
4 from rpy import *
5
6 assert sys.version_info[:2] >= ( 2, 4 )
7
8 def stop_err( msg ):
9 sys.stderr.write( "%s\n" % msg )
10 sys.exit()
11
12 def unzip( filename ):
13 zip_file = zipfile.ZipFile( filename, 'r' )
14 tmpfilename = tempfile.NamedTemporaryFile().name
15 for name in zip_file.namelist():
16 file( tmpfilename, 'a' ).write( zip_file.read( name ) )
17 zip_file.close()
18 return tmpfilename
19
20 def __main__():
21 infile_score_name = sys.argv[1].strip()
22 outfile_R_name = sys.argv[2].strip()
23
24 try:
25 score_threshold = int( sys.argv[3].strip() )
26 except:
27 stop_err( 'Threshold for quality score must be numerical.' )
28
29 infile_is_zipped = False
30 if zipfile.is_zipfile( infile_score_name ):
31 infile_is_zipped = True
32 infile_name = unzip( infile_score_name )
33 else:
34 infile_name = infile_score_name
35
36 # detect whether it's tabular or fasta format
37 seq_method = None
38 data_type = None
39 for i, line in enumerate( file( infile_name ) ):
40 line = line.rstrip( '\r\n' )
41 if not line or line.startswith( '#' ):
42 continue
43 if data_type == None:
44 if line.startswith( '>' ):
45 data_type = 'fasta'
46 continue
47 elif len( line.split( '\t' ) ) > 0:
48 fields = line.split()
49 for score in fields:
50 try:
51 int( score )
52 data_type = 'tabular'
53 seq_method = 'solexa'
54 break
55 except:
56 break
57 elif data_type == 'fasta':
58 fields = line.split()
59 for score in fields:
60 try:
61 int( score )
62 seq_method = '454'
63 break
64 except:
65 break
66 if i == 100:
67 break
68
69 if data_type is None:
70 stop_err( 'This tool can only use fasta data or tabular data.' )
71 if seq_method is None:
72 stop_err( 'Invalid data for fasta format.')
73
74 cont_high_quality = []
75 invalid_lines = 0
76 invalid_scores = 0
77 if seq_method == 'solexa':
78 for i, line in enumerate( open( infile_name ) ):
79 line = line.rstrip( '\r\n' )
80 if not line or line.startswith( '#' ):
81 continue
82 locs = line.split( '\t' )
83 for j, base in enumerate( locs ):
84 nuc_errors = base.split()
85 try:
86 nuc_errors[0] = int( nuc_errors[0] )
87 nuc_errors[1] = int( nuc_errors[1] )
88 nuc_errors[2] = int( nuc_errors[2] )
89 nuc_errors[3] = int( nuc_errors[3] )
90 big = max( nuc_errors )
91 except:
92 invalid_scores += 1
93 big = 0
94 if j == 0:
95 cont_high_quality.append(1)
96 else:
97 if big >= score_threshold:
98 cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
99 else:
100 cont_high_quality.append(1)
101 else: # seq_method == '454'
102 tmp_score = ''
103 for i, line in enumerate( open( infile_name ) ):
104 line = line.rstrip( '\r\n' )
105 if not line or line.startswith( '#' ):
106 continue
107 if line.startswith( '>' ):
108 if len( tmp_score ) > 0:
109 locs = tmp_score.split()
110 for j, base in enumerate( locs ):
111 try:
112 base = int( base )
113 except:
114 invalid_scores += 1
115 base = 0
116 if j == 0:
117 cont_high_quality.append(1)
118 else:
119 if base >= score_threshold:
120 cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
121 else:
122 cont_high_quality.append(1)
123 tmp_score = ''
124 else:
125 tmp_score = "%s %s" % ( tmp_score, line )
126 if len( tmp_score ) > 0:
127 locs = tmp_score.split()
128 for j, base in enumerate( locs ):
129 try:
130 base = int( base )
131 except:
132 invalid_scores += 1
133 base = 0
134 if j == 0:
135 cont_high_quality.append(1)
136 else:
137 if base >= score_threshold:
138 cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
139 else:
140 cont_high_quality.append(1)
141
142 # generate pdf figures
143 cont_high_quality = array ( cont_high_quality )
144 outfile_R_pdf = outfile_R_name
145 r.pdf( outfile_R_pdf )
146 title = "Histogram of continuous high quality scores"
147 xlim_range = [ 1, max( cont_high_quality ) ]
148 nclass = max( cont_high_quality )
149 if nclass > 100:
150 nclass = 100
151 r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
152 r.dev_off()
153
154 if infile_is_zipped and os.path.exists( infile_name ):
155 # Need to delete temporary file created when we unzipped the infile archive
156 os.remove( infile_name )
157
158 if invalid_lines > 0:
159 print 'Skipped %d invalid lines. ' % invalid_lines
160 if invalid_scores > 0:
161 print 'Skipped %d invalid scores. ' % invalid_scores
162
163 r.quit( save="no" )
164
165 if __name__=="__main__":__main__()