comparison data_filtering.py @ 0:0e37e5b73273 draft

Initial commit
author chmaramis
date Fri, 30 Mar 2018 07:22:29 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0e37e5b73273
1 # -*- coding: utf-8 -*-
2 """
3 Created on Wed Sep 4 18:41:42 2013
4
5 @author: chmaramis
6 """
7
8 from __future__ import division
9 import string as strpy
10 import numpy as np
11 from pandas import *
12 from numpy import nan as NA
13 import time
14 import sys
15
16
17 def filter_condition_AAjunction(x):
18 x= x.strip()
19 if ' ' in x:
20 return x.split(' ')[0]
21 else:
22 return x
23
24 #-----------frame creation---------------------
25 def dataFiltering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname):
26
27 try:
28 path=inp
29 frame = DataFrame()
30 seqlen = []
31 head = []
32 tp = read_csv(path, iterator=True, chunksize=5000,sep='\t', index_col=0 )
33 frame = concat([chunk for chunk in tp])
34
35 frcol = list(frame.columns)
36 #print frcol[-1]
37 if 'Unnamed' in frcol[-1]:
38 del frcol[-1]
39 frame=frame[frcol]
40
41 frame.index = range(1,len(frame)+1)
42
43 head.append('Total reads of raw data')
44 seqlen.append(len(frame))
45
46 #------------drop nulls--------------------
47 filtered = DataFrame()
48 filtall = DataFrame()
49 summ_df = DataFrame()
50 filtered = frame[isnull(frame['AA JUNCTION']) | isnull(frame['V-GENE and allele'])]
51
52 filtall = filtall.append(filtered)
53 if len(filtall) > 0:
54 filtall.loc[filtered.index,'Reason'] = "NoResults"
55 frame = frame[frame['AA JUNCTION'].notnull()]
56 frame = frame[frame['V-GENE and allele'].notnull()]
57
58 head.append('Not Null CDR3/V')
59 head.append('filter out')
60 seqlen.append(len(frame))
61 seqlen.append(len(filtered))
62 filtered = DataFrame()
63
64 if psorf.startswith('y') or psorf.startswith('Y'):
65
66 cc0=np.array(frame['V-GENE and allele'].unique())
67
68
69 for x in cc0:
70 x1=x.split('*')
71 try:
72 if (x1[1].find('P')>-1) or (x1[1].find('ORF')>-1):
73 filtered = filtered.append(frame[frame['V-GENE and allele'] == x])
74 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,NA)
75 elif x.find('or')>-1:
76 posa=x.count('or')
77 x2=x.split('or')
78 x4=''
79 genelist=[]
80 for cnt in range(0, posa+1):
81 x3=x2[cnt].split('*')
82 x3[0]=x3[0].strip()#kobei ta space
83 k=x3[0].split(' ')# holds only TRBV
84 if cnt==0:
85 genelist.append(k[1])
86 x4+=k[1]
87 elif ((str(k[1]) in genelist) == False) & (x3[1].find('P')==-1):# check for P in x3
88 genelist.append(k[1])
89 x4+=' or '
90 x4+=k[1]
91 x3=None
92 k1=None
93 genelist=None
94
95 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,x4)
96
97 else:
98 s=x1[0].split(' ')
99 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,s[1])
100 except IndexError as e:
101 print('V-gene is already been formed')
102 continue
103
104 x=None
105 x1=None
106 s=None
107
108 filtall = filtall.append(filtered)
109 if len(filtall) > 0:
110 filtall.loc[filtered.index,'Reason'] = 'P or ORF'
111 frame = frame[frame['V-GENE and allele'].notnull()]
112
113 head.append('Functional TRBV')
114 head.append('filter out')
115 seqlen.append(len(frame))
116 seqlen.append(len(filtered))
117 filtered = DataFrame()
118
119
120
121 #------------FILTERING for data quality--------------------
122 if con.startswith('y') or con.startswith('Y'):
123 filtered = frame [frame['AA JUNCTION'].str.contains('X') |
124 frame['AA JUNCTION'].str.contains('#') |
125 frame['AA JUNCTION'].str.contains('[*]')]
126
127
128
129 frame = frame [~frame['AA JUNCTION'].str.contains('X') &
130 ~frame['AA JUNCTION'].str.contains('#') &
131 ~frame['AA JUNCTION'].str.contains('[*]') ]
132
133
134 filtall = filtall.append(filtered)
135 if len(filtall) > 0:
136 filtall.loc[filtered.index,'Reason'] = 'X,#,*'
137 head.append('Not Containing X,#,*')
138 head.append('filter out')
139 seqlen.append(len(frame))
140 seqlen.append(len(filtered))
141 filtered = DataFrame()
142
143
144
145 if prod.startswith('y') or prod.startswith('Y'):
146 filtered = frame[~frame['Functionality'].str.startswith('productive')]
147 filtall = filtall.append(filtered)
148 if len(filtall) > 0:
149 filtall.loc[filtered.index,'Reason'] = 'not productive'
150
151
152 frame=frame[frame['Functionality'].str.startswith('productive')]
153
154 head.append('Productive')
155 head.append('filter out')
156 seqlen.append(len(frame))
157
158 seqlen.append(len(filtered))
159
160
161 frame['AA JUNCTION'] = frame['AA JUNCTION'].map(filter_condition_AAjunction)
162
163 if CF.startswith('y') or CF.startswith('Y'):
164 if cells == 'TCR':
165 filtered = DataFrame()
166 filtered = frame[~frame['AA JUNCTION'].str.startswith('C') |
167 ~frame['AA JUNCTION'].str.endswith('F')]
168
169 filtall = filtall.append(filtered)
170 if len(filtall) > 0:
171 filtall.loc[filtered.index,'Reason'] = 'Not C..F'
172
173 frame = frame[frame['AA JUNCTION'].str.startswith('C') &
174 frame['AA JUNCTION'].str.endswith('F')]
175
176 head.append('CDR3 landmarks C-F')
177 head.append('filter out')
178 seqlen.append(len(frame))
179 seqlen.append(len(filtered))
180 filtered = DataFrame()
181 elif cells == 'BCR':
182 filtered = DataFrame()
183 filtered = frame[~frame['AA JUNCTION'].str.startswith('C') |
184 ~frame['AA JUNCTION'].str.endswith('W')]
185
186 filtall = filtall.append(filtered)
187 if len(filtall) > 0:
188 filtall.loc[filtered.index,'Reason'] = 'Not C..W'
189
190 frame = frame[frame['AA JUNCTION'].str.startswith('C') &
191 frame['AA JUNCTION'].str.endswith('W')]
192
193 head.append('CDR3 landmarks C-W')
194 head.append('filter out')
195 seqlen.append(len(frame))
196 seqlen.append(len(filtered))
197 filtered = DataFrame()
198 else:
199 print('TCR or BCR type')
200
201
202 filtered = DataFrame()
203
204 filtered = frame[frame['V-REGION identity %'] < Vper]
205
206
207 filtall = filtall.append(filtered)
208 if len(filtall) > 0:
209 filtall.loc[filtered.index,'Reason'] = 'identity < {iden}%'.format(iden = Vper)
210
211 frame=frame[frame['V-REGION identity %']>= Vper]
212 head.append('Identity >= {iden}%'.format(iden = Vper))
213 head.append('filter out')
214 seqlen.append(len(frame))
215 seqlen.append(len(filtered))
216
217 head.append('Total filter out A')
218 head.append('Total filter in A')
219 seqlen.append(len(filtall))
220 seqlen.append(len(frame))
221
222 ###############################
223 if Vgene != 'null':
224
225 filtered = DataFrame()
226
227 filtered = frame[frame['V-GENE and allele'] != Vgene]
228
229 filtall = filtall.append(filtered)
230 if len(filtall) > 0:
231 filtall.loc[filtered.index,'Reason'] = 'V-GENE != {} '.format(Vgene)
232
233
234 frame = frame[frame['V-GENE and allele'] == Vgene]
235
236
237
238 head.append('V-GENE = {} '.format(Vgene))
239 head.append('filter out')
240 seqlen.append(len(frame))
241 seqlen.append(len(filtered))
242
243
244
245 ###############################
246 if (laa1 != 'null') or (laa2 != 'null'):
247 if int(laa2) == 0:
248 low = int(laa1)
249 high = 100
250 elif int(laa1) > int(laa2):
251 low = int(laa2)
252 high = int(laa1)
253 else:
254 low = int(laa1)
255 high = int(laa2)
256
257 filtered = DataFrame()
258 criteria = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) < low)
259 criteria2 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) > high)
260 filtered = frame[criteria | criteria2]
261
262 filtall = filtall.append(filtered)
263 if int(laa2)==0:
264 if len(filtall) > 0:
265 filtall.loc[filtered.index,'Reason'] = 'CDR3 length not bigger than {}'.format(low)
266 else:
267 if len(filtall) > 0:
268 filtall.loc[filtered.index,'Reason'] = 'CDR3 length not from {} to {}'.format(low,high)
269
270 criteria3 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) >= low)
271 criteria4 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) <= high)
272 frame = frame[criteria3 & criteria4]
273
274 if int(laa2)==0:
275 head.append('CDR3 length bigger than {}'.format(low))
276 else:
277 head.append('CDR3 length from {} to {} '.format(low,high))
278 head.append('filter out')
279 seqlen.append(len(frame))
280 seqlen.append(len(filtered))
281
282 ###############################
283 if conaa != 'null':
284 if conaa.islower():
285 conaa = conaa.upper()
286 filtered = DataFrame()
287
288 filtered = frame[~frame['AA JUNCTION'].str.contains(conaa)]
289
290 filtall = filtall.append(filtered)
291 if len(filtall) > 0:
292 filtall.loc[filtered.index,'Reason'] = 'CDR3 not containing {}'.format(conaa)
293
294 frame = frame[frame['AA JUNCTION'].str.contains(conaa) ]
295
296 head.append('CDR3 containing {}'.format(conaa))
297 head.append('filter out')
298 seqlen.append(len(frame))
299 seqlen.append(len(filtered))
300
301
302
303
304 #####------------keep the small J gene name--------------------
305 #frame['J-GENE and allele'] = frame['J-GENE and allele'].map(filter_condition_Jgene)
306 cc2=np.array(frame['J-GENE and allele'].unique())
307
308 for x in cc2:
309 try:
310 if notnull(x):
311 x1=x.split('*')
312 # print(x)
313 # print (x1[0])
314 trbj=x1[0].split(' ')
315 frame['J-GENE and allele']=frame['J-GENE and allele'].replace(x,trbj[1])
316 except IndexError as e:
317 print('J-Gene has been formed')
318
319
320
321 x=None
322 x1=None
323
324
325 #------------keep the small D gene name--------------------
326 cc1=np.array(frame['D-GENE and allele'].unique())
327 for x in cc1:
328 try:
329 if notnull(x):
330 x1=x.split('*')
331 trbd=x1[0].split(' ')
332 frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,trbd[1])
333 else:
334 frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,'none')
335 except IndexError as e:
336 print('D-gene has been formed')
337
338
339 x=None
340 x1=None
341
342
343 if Jgene != 'null':
344
345 filtered = DataFrame()
346
347 filtered = frame[frame['J-GENE and allele'] != Jgene]
348
349 filtall = filtall.append(filtered)
350 if len(filtall) > 0:
351 filtall.loc[filtered.index,'Reason'] = 'J-GENE not {} '.format(Jgene)
352
353
354 frame = frame[frame['J-GENE and allele'] == Jgene]
355
356
357
358 head.append('J-GENE = {} '.format(Jgene))
359 head.append('filter out')
360 seqlen.append(len(frame))
361 seqlen.append(len(filtered))
362
363
364
365 if Dgene != 'null':
366
367 filtered = DataFrame()
368
369 filtered = frame[frame['D-GENE and allele'] != Dgene]
370
371 filtall = filtall.append(filtered)
372 if len(filtall) > 0:
373 filtall.loc[filtered.index,'Reason'] = 'D-GENE not {} '.format(Dgene)
374
375
376 frame = frame[frame['D-GENE and allele'] == Dgene]
377
378
379
380 head.append('D-GENE = {} '.format(Dgene))
381 head.append('filter out')
382 seqlen.append(len(frame))
383 seqlen.append(len(filtered))
384
385
386 head.append('Total filter out')
387 head.append('Total filter in')
388 seqlen.append(len(filtall))
389 seqlen.append(len(frame))
390 summ_df = DataFrame(index = head)
391 col = fname
392
393 summ_df[col] = seqlen
394 frame=frame.rename(columns = {'V-GENE and allele':'V-GENE',
395 'J-GENE and allele':'J-GENE','D-GENE and allele':'D-GENE'})
396
397
398 frcol.append('Reason')
399
400 filtall = filtall[frcol]
401
402 #--------------out CSV---------------------------
403 frame.index = range(1,len(frame)+1)
404 if not summ_df.empty:
405 summ_df['%'] = (100*summ_df[summ_df.columns[0]]/summ_df[summ_df.columns[0]][summ_df.index[0]]).map(('{:.4f}'.format))
406 return(frame,filtall,summ_df)
407 except KeyError as e:
408 print('This file has no ' + str(e) + ' column')
409 return(frame,filtall,summ_df)
410
411
412 if __name__ == '__main__':
413
414 start=time.time()
415
416 # Parse input arguments
417 inp = sys.argv[1]
418 cells = sys.argv[2]
419 psorf = sys.argv[3]
420 con = sys.argv[4]
421 prod = sys.argv[5]
422 CF = sys.argv[6]
423 Vper = float(sys.argv[7])
424 Vgene = sys.argv[8]
425 laa1 = sys.argv[9]
426 conaa = sys.argv[10]
427 filterin = sys.argv[11]
428 filterout = sys.argv[12]
429 Sum_table = sys.argv[13]
430 Jgene = sys.argv[14]
431 Dgene = sys.argv[15]
432 laa2 = sys.argv[16]
433 fname = sys.argv[17]
434
435 # Execute basic function
436 fin,fout,summ = dataFiltering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname)
437
438 # Save output to CSV files
439 if not summ.empty:
440 summ.to_csv(Sum_table, sep = '\t')
441 if not fin.empty:
442 fin.to_csv(filterin , sep = '\t')
443 if not fout.empty:
444 fout.to_csv(filterout, sep= '\t')
445
446 # Print execution time
447 stop=time.time()
448 print('Runtime:' + str(stop-start))
449