Mercurial > repos > devteam > short_reads_figure_high_quality_length
comparison short_reads_figure_high_quality_length.py @ 0:556ceed24699 draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 12:34:37 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:556ceed24699 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os, sys, math, tempfile, zipfile, re | |
4 from rpy import * | |
5 | |
6 assert sys.version_info[:2] >= ( 2, 4 ) | |
7 | |
8 def stop_err( msg ): | |
9 sys.stderr.write( "%s\n" % msg ) | |
10 sys.exit() | |
11 | |
12 def unzip( filename ): | |
13 zip_file = zipfile.ZipFile( filename, 'r' ) | |
14 tmpfilename = tempfile.NamedTemporaryFile().name | |
15 for name in zip_file.namelist(): | |
16 file( tmpfilename, 'a' ).write( zip_file.read( name ) ) | |
17 zip_file.close() | |
18 return tmpfilename | |
19 | |
20 def __main__(): | |
21 infile_score_name = sys.argv[1].strip() | |
22 outfile_R_name = sys.argv[2].strip() | |
23 | |
24 try: | |
25 score_threshold = int( sys.argv[3].strip() ) | |
26 except: | |
27 stop_err( 'Threshold for quality score must be numerical.' ) | |
28 | |
29 infile_is_zipped = False | |
30 if zipfile.is_zipfile( infile_score_name ): | |
31 infile_is_zipped = True | |
32 infile_name = unzip( infile_score_name ) | |
33 else: | |
34 infile_name = infile_score_name | |
35 | |
36 # detect whether it's tabular or fasta format | |
37 seq_method = None | |
38 data_type = None | |
39 for i, line in enumerate( file( infile_name ) ): | |
40 line = line.rstrip( '\r\n' ) | |
41 if not line or line.startswith( '#' ): | |
42 continue | |
43 if data_type == None: | |
44 if line.startswith( '>' ): | |
45 data_type = 'fasta' | |
46 continue | |
47 elif len( line.split( '\t' ) ) > 0: | |
48 fields = line.split() | |
49 for score in fields: | |
50 try: | |
51 int( score ) | |
52 data_type = 'tabular' | |
53 seq_method = 'solexa' | |
54 break | |
55 except: | |
56 break | |
57 elif data_type == 'fasta': | |
58 fields = line.split() | |
59 for score in fields: | |
60 try: | |
61 int( score ) | |
62 seq_method = '454' | |
63 break | |
64 except: | |
65 break | |
66 if i == 100: | |
67 break | |
68 | |
69 if data_type is None: | |
70 stop_err( 'This tool can only use fasta data or tabular data.' ) | |
71 if seq_method is None: | |
72 stop_err( 'Invalid data for fasta format.') | |
73 | |
74 cont_high_quality = [] | |
75 invalid_lines = 0 | |
76 invalid_scores = 0 | |
77 if seq_method == 'solexa': | |
78 for i, line in enumerate( open( infile_name ) ): | |
79 line = line.rstrip( '\r\n' ) | |
80 if not line or line.startswith( '#' ): | |
81 continue | |
82 locs = line.split( '\t' ) | |
83 for j, base in enumerate( locs ): | |
84 nuc_errors = base.split() | |
85 try: | |
86 nuc_errors[0] = int( nuc_errors[0] ) | |
87 nuc_errors[1] = int( nuc_errors[1] ) | |
88 nuc_errors[2] = int( nuc_errors[2] ) | |
89 nuc_errors[3] = int( nuc_errors[3] ) | |
90 big = max( nuc_errors ) | |
91 except: | |
92 invalid_scores += 1 | |
93 big = 0 | |
94 if j == 0: | |
95 cont_high_quality.append(1) | |
96 else: | |
97 if big >= score_threshold: | |
98 cont_high_quality[ len( cont_high_quality ) - 1 ] += 1 | |
99 else: | |
100 cont_high_quality.append(1) | |
101 else: # seq_method == '454' | |
102 tmp_score = '' | |
103 for i, line in enumerate( open( infile_name ) ): | |
104 line = line.rstrip( '\r\n' ) | |
105 if not line or line.startswith( '#' ): | |
106 continue | |
107 if line.startswith( '>' ): | |
108 if len( tmp_score ) > 0: | |
109 locs = tmp_score.split() | |
110 for j, base in enumerate( locs ): | |
111 try: | |
112 base = int( base ) | |
113 except: | |
114 invalid_scores += 1 | |
115 base = 0 | |
116 if j == 0: | |
117 cont_high_quality.append(1) | |
118 else: | |
119 if base >= score_threshold: | |
120 cont_high_quality[ len( cont_high_quality ) - 1 ] += 1 | |
121 else: | |
122 cont_high_quality.append(1) | |
123 tmp_score = '' | |
124 else: | |
125 tmp_score = "%s %s" % ( tmp_score, line ) | |
126 if len( tmp_score ) > 0: | |
127 locs = tmp_score.split() | |
128 for j, base in enumerate( locs ): | |
129 try: | |
130 base = int( base ) | |
131 except: | |
132 invalid_scores += 1 | |
133 base = 0 | |
134 if j == 0: | |
135 cont_high_quality.append(1) | |
136 else: | |
137 if base >= score_threshold: | |
138 cont_high_quality[ len( cont_high_quality ) - 1 ] += 1 | |
139 else: | |
140 cont_high_quality.append(1) | |
141 | |
142 # generate pdf figures | |
143 cont_high_quality = array ( cont_high_quality ) | |
144 outfile_R_pdf = outfile_R_name | |
145 r.pdf( outfile_R_pdf ) | |
146 title = "Histogram of continuous high quality scores" | |
147 xlim_range = [ 1, max( cont_high_quality ) ] | |
148 nclass = max( cont_high_quality ) | |
149 if nclass > 100: | |
150 nclass = 100 | |
151 r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass) | |
152 r.dev_off() | |
153 | |
154 if infile_is_zipped and os.path.exists( infile_name ): | |
155 # Need to delete temporary file created when we unzipped the infile archive | |
156 os.remove( infile_name ) | |
157 | |
158 if invalid_lines > 0: | |
159 print 'Skipped %d invalid lines. ' % invalid_lines | |
160 if invalid_scores > 0: | |
161 print 'Skipped %d invalid scores. ' % invalid_scores | |
162 | |
163 r.quit( save="no" ) | |
164 | |
165 if __name__=="__main__":__main__() |