0
|
1 #!/usr/bin/env python
|
|
2 #Guruprasad Ananda
|
|
3 """
|
|
4 Filter based on nucleotide quality (PHRED score).
|
|
5
|
|
6 usage: %prog input out_file primary_species mask_species score mask_char mask_region mask_region_length
|
|
7 """
|
|
8
|
|
9
|
|
10 from __future__ import division
|
|
11 from galaxy import eggs
|
|
12 import pkg_resources
|
|
13 pkg_resources.require( "lrucache" )
|
|
14 import numpy
|
|
15
|
|
16 import sys
|
|
17 import os, os.path
|
|
18 from UserDict import DictMixin
|
|
19 from bx.binned_array import FileBinnedArray
|
|
20 from bx.bitset import *
|
|
21 from bx.bitset_builders import *
|
|
22 from bx.cookbook import doc_optparse
|
|
23 import bx.align.maf
|
|
24
|
|
25 class FileBinnedArrayDir( DictMixin ):
|
|
26 """
|
|
27 Adapter that makes a directory of FileBinnedArray files look like
|
|
28 a regular dict of BinnedArray objects.
|
|
29 """
|
|
30 def __init__( self, dir ):
|
|
31 self.dir = dir
|
|
32 self.cache = dict()
|
|
33 def __getitem__( self, key ):
|
|
34 value = None
|
|
35 if key in self.cache:
|
|
36 value = self.cache[key]
|
|
37 else:
|
|
38 fname = os.path.join( self.dir, "%s.qa.bqv" % key )
|
|
39 if os.path.exists( fname ):
|
|
40 value = FileBinnedArray( open( fname ) )
|
|
41 self.cache[key] = value
|
|
42 if value is None:
|
|
43 raise KeyError( "File does not exist: " + fname )
|
|
44 return value
|
|
45
|
|
46 def stop_err(msg):
|
|
47 sys.stderr.write(msg)
|
|
48 sys.exit()
|
|
49
|
|
50 def load_scores_ba_dir( dir ):
|
|
51 """
|
|
52 Return a dict-like object (keyed by chromosome) that returns
|
|
53 FileBinnedArray objects created from "key.ba" files in `dir`
|
|
54 """
|
|
55 return FileBinnedArrayDir( dir )
|
|
56
|
|
57 def bitwise_and ( string1, string2, maskch ):
|
|
58 result = []
|
|
59 for i, ch in enumerate(string1):
|
|
60 try:
|
|
61 ch = int(ch)
|
|
62 except:
|
|
63 pass
|
|
64 if string2[i] == '-':
|
|
65 ch = 1
|
|
66 if ch and string2[i]:
|
|
67 result.append(string2[i])
|
|
68 else:
|
|
69 result.append(maskch)
|
|
70 return ''.join(result)
|
|
71
|
|
72 def main():
|
|
73 # Parsing Command Line here
|
|
74 options, args = doc_optparse.parse( __doc__ )
|
|
75
|
|
76 try:
|
|
77 #chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols )
|
|
78 inp_file, out_file, pri_species, mask_species, qual_cutoff, mask_chr, mask_region, mask_length, loc_file = args
|
|
79 qual_cutoff = int(qual_cutoff)
|
|
80 mask_chr = int(mask_chr)
|
|
81 mask_region = int(mask_region)
|
|
82 if mask_region != 3:
|
|
83 mask_length = int(mask_length)
|
|
84 else:
|
|
85 mask_length_r = int(mask_length.split(',')[0])
|
|
86 mask_length_l = int(mask_length.split(',')[1])
|
|
87 except:
|
|
88 stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." )
|
|
89
|
|
90 if pri_species == 'None':
|
|
91 stop_err( "No primary species selected, try again by selecting at least one primary species." )
|
|
92 if mask_species == 'None':
|
|
93 stop_err( "No mask species selected, try again by selecting at least one species to mask." )
|
|
94
|
|
95 mask_chr_count = 0
|
|
96 mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'}
|
|
97 mask_reg_dict = {0:'Current pos', 1:'Current+Downstream', 2:'Current+Upstream', 3:'Current+Both sides'}
|
|
98
|
|
99 #ensure dbkey is present in the twobit loc file
|
|
100 try:
|
|
101 pspecies_all = pri_species.split(',')
|
|
102 pspecies_all2 = pri_species.split(',')
|
|
103 pspecies = []
|
|
104 filepaths = []
|
|
105 for line in open(loc_file):
|
|
106 if pspecies_all2 == []:
|
|
107 break
|
|
108 if line[0:1] == "#":
|
|
109 continue
|
|
110 fields = line.split('\t')
|
|
111 try:
|
|
112 build = fields[0]
|
|
113 for i, dbkey in enumerate(pspecies_all2):
|
|
114 if dbkey == build:
|
|
115 pspecies.append(build)
|
|
116 filepaths.append(fields[1])
|
|
117 del pspecies_all2[i]
|
|
118 else:
|
|
119 continue
|
|
120 except:
|
|
121 pass
|
|
122 except Exception, exc:
|
|
123 stop_err( 'Initialization errorL %s' % str( exc ) )
|
|
124
|
|
125 if len(pspecies) == 0:
|
|
126 stop_err( "Quality scores are not available for the following genome builds: %s" % ( pspecies_all2 ) )
|
|
127 if len(pspecies) < len(pspecies_all):
|
|
128 print "Quality scores are not available for the following genome builds: %s" % (pspecies_all2)
|
|
129
|
|
130 scores_by_chrom = []
|
|
131 #Get scores for all the primary species
|
|
132 for file in filepaths:
|
|
133 scores_by_chrom.append(load_scores_ba_dir( file.strip() ))
|
|
134
|
|
135 try:
|
|
136 maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
|
|
137 maf_writer = bx.align.maf.Writer( open(out_file,'w') )
|
|
138 except Exception, e:
|
|
139 stop_err( "Your MAF file appears to be malformed: %s" % str( e ) )
|
|
140
|
|
141 maf_count = 0
|
|
142 for block in maf_reader:
|
|
143 status_strings = []
|
|
144 for seq in range (len(block.components)):
|
|
145 src = block.components[seq].src
|
|
146 dbkey = src.split('.')[0]
|
|
147 chr = src.split('.')[1]
|
|
148 if not (dbkey in pspecies):
|
|
149 continue
|
|
150 else: #enter if the species is a primary species
|
|
151 index = pspecies.index(dbkey)
|
|
152 sequence = block.components[seq].text
|
|
153 s_start = block.components[seq].start
|
|
154 size = len(sequence) #this includes the gaps too
|
|
155 status_str = '1'*size
|
|
156 status_list = list(status_str)
|
|
157 if status_strings == []:
|
|
158 status_strings.append(status_str)
|
|
159 ind = 0
|
|
160 s_end = block.components[seq].end
|
|
161 #Get scores for the entire sequence
|
|
162 try:
|
|
163 scores = scores_by_chrom[index][chr][s_start:s_end]
|
|
164 except:
|
|
165 continue
|
|
166 pos = 0
|
|
167 while pos < (s_end-s_start):
|
|
168 if sequence[ind] == '-': #No score for GAPS
|
|
169 ind += 1
|
|
170 continue
|
|
171 score = scores[pos]
|
|
172 if score < qual_cutoff:
|
|
173 score = 0
|
|
174
|
|
175 if not(score):
|
|
176 if mask_region == 0: #Mask Corresponding position only
|
|
177 status_list[ind] = '0'
|
|
178 ind += 1
|
|
179 pos += 1
|
|
180 elif mask_region == 1: #Mask Corresponding position + downstream neighbors
|
|
181 for n in range(mask_length+1):
|
|
182 try:
|
|
183 status_list[ind+n] = '0'
|
|
184 except:
|
|
185 pass
|
|
186 ind = ind + mask_length + 1
|
|
187 pos = pos + mask_length + 1
|
|
188 elif mask_region == 2: #Mask Corresponding position + upstream neighbors
|
|
189 for n in range(mask_length+1):
|
|
190 try:
|
|
191 status_list[ind-n] = '0'
|
|
192 except:
|
|
193 pass
|
|
194 ind += 1
|
|
195 pos += 1
|
|
196 elif mask_region == 3: #Mask Corresponding position + neighbors on both sides
|
|
197 for n in range(-mask_length_l, mask_length_r+1):
|
|
198 try:
|
|
199 status_list[ind+n] = '0'
|
|
200 except:
|
|
201 pass
|
|
202 ind = ind + mask_length_r + 1
|
|
203 pos = pos + mask_length_r + 1
|
|
204 else:
|
|
205 pos += 1
|
|
206 ind += 1
|
|
207
|
|
208 status_strings.append(''.join(status_list))
|
|
209
|
|
210 if status_strings == []: #this block has no primary species
|
|
211 continue
|
|
212 output_status_str = status_strings[0]
|
|
213 for stat in status_strings[1:]:
|
|
214 try:
|
|
215 output_status_str = bitwise_and (status_strings[0], stat, '0')
|
|
216 except Exception, e:
|
|
217 break
|
|
218
|
|
219 for seq in range (len(block.components)):
|
|
220 src = block.components[seq].src
|
|
221 dbkey = src.split('.')[0]
|
|
222 if dbkey not in mask_species.split(','):
|
|
223 continue
|
|
224 sequence = block.components[seq].text
|
|
225 sequence = bitwise_and (output_status_str, sequence, mask_chr_dict[mask_chr])
|
|
226 block.components[seq].text = sequence
|
|
227 mask_chr_count += output_status_str.count('0')
|
|
228 maf_writer.write(block)
|
|
229 maf_count += 1
|
|
230
|
|
231 maf_reader.close()
|
|
232 maf_writer.close()
|
|
233 print "No. of blocks = %d; No. of masked nucleotides = %s; Mask character = %s; Mask region = %s; Cutoff used = %d" % (maf_count, mask_chr_count, mask_chr_dict[mask_chr], mask_reg_dict[mask_region], qual_cutoff)
|
|
234
|
|
235
|
|
236 if __name__ == "__main__":
|
|
237 main()
|