comparison ngs_filtering.py @ 12:cdf95051bc55 draft default tip

Uploaded 2 tools
author chmaramis
date Sun, 18 Mar 2018 07:11:06 -0400
parents
children
comparison
equal deleted inserted replaced
11:14896ea6e180 12:cdf95051bc55
1 # -*- coding: utf-8 -*-
2 """
3 Created on Wed Sep 4 18:41:42 2013
4
5 @author: chmaramis
6 """
7
8 from __future__ import division
9 import string as strpy
10 import numpy as np
11 from pandas import *
12 from numpy import nan as NA
13 import time
14 import sys
15
16
17 def filter_condition_AAjunction(x):
18 x= x.strip()
19 if ' ' in x:
20 return x.split(' ')[0]
21 else:
22 return x
23
24 #-----------frame creation---------------------
25 def filtering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname):
26
27 try:
28 path=inp
29 frame = DataFrame()
30 seqlen = []
31 head = []
32 tp = read_csv(path, iterator=True, chunksize=5000,sep='\t', index_col=0 )
33 frame = concat([chunk for chunk in tp])
34
35 frcol = list(frame.columns)
36 #print frcol[-1]
37 if 'Unnamed' in frcol[-1]:
38 del frcol[-1]
39 frame=frame[frcol]
40
41 frame.index = range(1,len(frame)+1)
42
43 head.append('Total reads of raw data')
44 seqlen.append(len(frame))
45
46 #------------drop nulls--------------------
47 filtered = DataFrame()
48 filtall = DataFrame()
49 summ_df = DataFrame()
50 filtered = frame[isnull(frame['AA JUNCTION']) | isnull(frame['V-GENE and allele'])]
51
52 filtall = filtall.append(filtered)
53 if len(filtall) > 0:
54 filtall.loc[filtered.index,'Reason'] = "NoResults"
55 frame = frame[frame['AA JUNCTION'].notnull()]
56 frame = frame[frame['V-GENE and allele'].notnull()]
57
58 head.append('Not Null CDR3/V')
59 head.append('filter out')
60 seqlen.append(len(frame))
61 seqlen.append(len(filtered))
62 filtered = DataFrame()
63
64 if psorf.startswith('y') or psorf.startswith('Y'):
65
66 cc0=np.array(frame['V-GENE and allele'].unique())
67
68
69 for x in cc0:
70 x1=x.split('*')
71 try:
72 if (x1[1].find('P')>-1) or (x1[1].find('ORF')>-1):
73 filtered = filtered.append(frame[frame['V-GENE and allele'] == x])
74 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,NA)
75 elif x.find('or')>-1:
76 posa=x.count('or')
77 x2=x.split('or')
78 x4=''
79 genelist=[]
80 for cnt in range(0, posa+1):
81 x3=x2[cnt].split('*')
82 x3[0]=x3[0].strip()#kobei ta space
83 k=x3[0].split(' ')# holds only TRBV
84 if cnt==0:
85 genelist.append(k[1])
86 x4+=k[1]
87 elif ((str(k[1]) in genelist) == False) & (x3[1].find('P')==-1):# check for P in x3
88 genelist.append(k[1])
89 x4+=' or '
90 x4+=k[1]
91 x3=None
92 k1=None
93 genelist=None
94
95 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,x4)
96
97 else:
98 s=x1[0].split(' ')
99 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,s[1])
100 except IndexError as e:
101 print('V-gene is already been formed')
102 continue
103
104 x=None
105 x1=None
106 s=None
107
108 filtall = filtall.append(filtered)
109 if len(filtall) > 0:
110 filtall.loc[filtered.index,'Reason'] = 'P or ORF'
111 frame = frame[frame['V-GENE and allele'].notnull()]
112
113 head.append('Functional TRBV')
114 head.append('filter out')
115 seqlen.append(len(frame))
116 seqlen.append(len(filtered))
117 filtered = DataFrame()
118
119
120
121 #------------FILTERING for data quality--------------------
122 if con.startswith('y') or con.startswith('Y'):
123 filtered = frame [frame['AA JUNCTION'].str.contains('X') |
124 frame['AA JUNCTION'].str.contains('#') |
125 frame['AA JUNCTION'].str.contains('[*]')]
126
127
128
129 frame = frame [~frame['AA JUNCTION'].str.contains('X') &
130 ~frame['AA JUNCTION'].str.contains('#') &
131 ~frame['AA JUNCTION'].str.contains('[*]') ]
132
133
134 filtall = filtall.append(filtered)
135 if len(filtall) > 0:
136 filtall.loc[filtered.index,'Reason'] = 'X,#,*'
137 head.append('Not Containing X,#,*')
138 head.append('filter out')
139 seqlen.append(len(frame))
140 seqlen.append(len(filtered))
141 filtered = DataFrame()
142
143 # Set label of functionality column, taking into account current & past IMGT Summary column label
144 functionality_label = 'Functionality'
145 if 'V-DOMAIN Functionality' in frame.columns:
146 functionality_label = 'V-DOMAIN Functionality'
147
148 if prod.startswith('y') or prod.startswith('Y'):
149 filtered = frame[~frame[functionality_label].str.startswith('productive')]
150 filtall = filtall.append(filtered)
151 if len(filtall) > 0:
152 filtall.loc[filtered.index,'Reason'] = 'not productive'
153
154
155 frame=frame[frame[functionality_label].str.startswith('productive')]
156
157 head.append('Productive')
158 head.append('filter out')
159 seqlen.append(len(frame))
160
161 seqlen.append(len(filtered))
162
163
164 frame['AA JUNCTION'] = frame['AA JUNCTION'].map(filter_condition_AAjunction)
165
166 if CF.startswith('y') or CF.startswith('Y'):
167 if cells == 'TCR':
168 filtered = DataFrame()
169 filtered = frame[~frame['AA JUNCTION'].str.startswith('C') |
170 ~frame['AA JUNCTION'].str.endswith('F')]
171
172 filtall = filtall.append(filtered)
173 if len(filtall) > 0:
174 filtall.loc[filtered.index,'Reason'] = 'Not C..F'
175
176 frame = frame[frame['AA JUNCTION'].str.startswith('C') &
177 frame['AA JUNCTION'].str.endswith('F')]
178
179 head.append('CDR3 landmarks C-F')
180 head.append('filter out')
181 seqlen.append(len(frame))
182 seqlen.append(len(filtered))
183 filtered = DataFrame()
184 elif cells == 'BCR':
185 filtered = DataFrame()
186 filtered = frame[~frame['AA JUNCTION'].str.startswith('C') |
187 ~frame['AA JUNCTION'].str.endswith('W')]
188
189 filtall = filtall.append(filtered)
190 if len(filtall) > 0:
191 filtall.loc[filtered.index,'Reason'] = 'Not C..W'
192
193 frame = frame[frame['AA JUNCTION'].str.startswith('C') &
194 frame['AA JUNCTION'].str.endswith('W')]
195
196 head.append('CDR3 landmarks C-W')
197 head.append('filter out')
198 seqlen.append(len(frame))
199 seqlen.append(len(filtered))
200 filtered = DataFrame()
201 else:
202 print('TCR or BCR type')
203
204
205 filtered = DataFrame()
206
207 filtered = frame[frame['V-REGION identity %'] < Vper]
208
209
210 filtall = filtall.append(filtered)
211 if len(filtall) > 0:
212 filtall.loc[filtered.index,'Reason'] = 'identity < {iden}%'.format(iden = Vper)
213
214 frame=frame[frame['V-REGION identity %']>= Vper]
215 head.append('Identity >= {iden}%'.format(iden = Vper))
216 head.append('filter out')
217 seqlen.append(len(frame))
218 seqlen.append(len(filtered))
219
220 head.append('Total filter out A')
221 head.append('Total filter in A')
222 seqlen.append(len(filtall))
223 seqlen.append(len(frame))
224
225 ###############################
226 if Vgene != 'null':
227
228 filtered = DataFrame()
229
230 filtered = frame[frame['V-GENE and allele'] != Vgene]
231
232 filtall = filtall.append(filtered)
233 if len(filtall) > 0:
234 filtall.loc[filtered.index,'Reason'] = 'V-GENE != {} '.format(Vgene)
235
236
237 frame = frame[frame['V-GENE and allele'] == Vgene]
238
239
240
241 head.append('V-GENE = {} '.format(Vgene))
242 head.append('filter out')
243 seqlen.append(len(frame))
244 seqlen.append(len(filtered))
245
246
247
248 ###############################
249 if (laa1 != 'null') or (laa2 != 'null'):
250 if int(laa2) == 0:
251 low = int(laa1)
252 high = 100
253 elif int(laa1) > int(laa2):
254 low = int(laa2)
255 high = int(laa1)
256 else:
257 low = int(laa1)
258 high = int(laa2)
259
260 filtered = DataFrame()
261 criteria = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) < low)
262 criteria2 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) > high)
263 filtered = frame[criteria | criteria2]
264
265 filtall = filtall.append(filtered)
266 if int(laa2)==0:
267 if len(filtall) > 0:
268 filtall.loc[filtered.index,'Reason'] = 'CDR3 length not bigger than {}'.format(low)
269 else:
270 if len(filtall) > 0:
271 filtall.loc[filtered.index,'Reason'] = 'CDR3 length not from {} to {}'.format(low,high)
272
273 criteria3 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) >= low)
274 criteria4 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) <= high)
275 frame = frame[criteria3 & criteria4]
276
277 if int(laa2)==0:
278 head.append('CDR3 length bigger than {}'.format(low))
279 else:
280 head.append('CDR3 length from {} to {} '.format(low,high))
281 head.append('filter out')
282 seqlen.append(len(frame))
283 seqlen.append(len(filtered))
284
285 ###############################
286 if conaa != 'null':
287 if conaa.islower():
288 conaa = conaa.upper()
289 filtered = DataFrame()
290
291 filtered = frame[~frame['AA JUNCTION'].str.contains(conaa)]
292
293 filtall = filtall.append(filtered)
294 if len(filtall) > 0:
295 filtall.loc[filtered.index,'Reason'] = 'CDR3 not containing {}'.format(conaa)
296
297 frame = frame[frame['AA JUNCTION'].str.contains(conaa) ]
298
299 head.append('CDR3 containing {}'.format(conaa))
300 head.append('filter out')
301 seqlen.append(len(frame))
302 seqlen.append(len(filtered))
303
304
305
306
307 #####------------keep the small J gene name--------------------
308 #frame['J-GENE and allele'] = frame['J-GENE and allele'].map(filter_condition_Jgene)
309 cc2=np.array(frame['J-GENE and allele'].unique())
310
311 for x in cc2:
312 try:
313 if notnull(x):
314 x1=x.split('*')
315 # print(x)
316 # print (x1[0])
317 trbj=x1[0].split(' ')
318 frame['J-GENE and allele']=frame['J-GENE and allele'].replace(x,trbj[1])
319 except IndexError as e:
320 print('J-Gene has been formed')
321
322
323
324 x=None
325 x1=None
326
327
328 #------------keep the small D gene name--------------------
329 cc1=np.array(frame['D-GENE and allele'].unique())
330 for x in cc1:
331 try:
332 if notnull(x):
333 x1=x.split('*')
334 trbd=x1[0].split(' ')
335 frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,trbd[1])
336 else:
337 frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,'none')
338 except IndexError as e:
339 print('D-gene has been formed')
340
341
342 x=None
343 x1=None
344
345
346 if Jgene != 'null':
347
348 filtered = DataFrame()
349
350 filtered = frame[frame['J-GENE and allele'] != Jgene]
351
352 filtall = filtall.append(filtered)
353 if len(filtall) > 0:
354 filtall.loc[filtered.index,'Reason'] = 'J-GENE not {} '.format(Jgene)
355
356
357 frame = frame[frame['J-GENE and allele'] == Jgene]
358
359
360
361 head.append('J-GENE = {} '.format(Jgene))
362 head.append('filter out')
363 seqlen.append(len(frame))
364 seqlen.append(len(filtered))
365
366
367
368 if Dgene != 'null':
369
370 filtered = DataFrame()
371
372 filtered = frame[frame['D-GENE and allele'] != Dgene]
373
374 filtall = filtall.append(filtered)
375 if len(filtall) > 0:
376 filtall.loc[filtered.index,'Reason'] = 'D-GENE not {} '.format(Dgene)
377
378
379 frame = frame[frame['D-GENE and allele'] == Dgene]
380
381
382
383 head.append('D-GENE = {} '.format(Dgene))
384 head.append('filter out')
385 seqlen.append(len(frame))
386 seqlen.append(len(filtered))
387
388
389 head.append('Total filter out')
390 head.append('Total filter in')
391 seqlen.append(len(filtall))
392 seqlen.append(len(frame))
393 summ_df = DataFrame(index = head)
394 col = fname
395
396 summ_df[col] = seqlen
397 frame=frame.rename(columns = {'V-GENE and allele':'V-GENE',
398 'J-GENE and allele':'J-GENE','D-GENE and allele':'D-GENE'})
399
400
401 frcol.append('Reason')
402
403 filtall = filtall[frcol]
404
405 #--------------out CSV---------------------------
406 frame.index = range(1,len(frame)+1)
407 if not summ_df.empty:
408 summ_df['%'] = (100*summ_df[summ_df.columns[0]]/summ_df[summ_df.columns[0]][summ_df.index[0]]).map(('{:.4f}'.format))
409 return(frame,filtall,summ_df)
410 except KeyError as e:
411 print('This file has no ' + str(e) + ' column')
412 return(frame,filtall,summ_df)
413
414
415 if __name__ == '__main__':
416
417 start=time.time()
418
419 # Parse input arguments
420 inp = sys.argv[1]
421 cells = sys.argv[2]
422 psorf = sys.argv[3]
423 con = sys.argv[4]
424 prod = sys.argv[5]
425 CF = sys.argv[6]
426 Vper = float(sys.argv[7])
427 Vgene = sys.argv[8]
428 laa1 = sys.argv[9]
429 conaa = sys.argv[10]
430 filterin = sys.argv[11]
431 filterout = sys.argv[12]
432 Sum_table = sys.argv[13]
433 Jgene = sys.argv[14]
434 Dgene = sys.argv[15]
435 laa2 = sys.argv[16]
436 fname = sys.argv[17]
437
438 # Execute basic function
439 fin,fout,summ = filtering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname)
440
441 # Save output to CSV files
442 if not summ.empty:
443 summ.to_csv(Sum_table, sep = '\t')
444 if not fin.empty:
445 fin.to_csv(filterin , sep = '\t')
446 if not fout.empty:
447 fout.to_csv(filterout, sep= '\t')
448
449 # Print execution time
450 stop=time.time()
451 print('Runtime:' + str(stop-start))
452