Mercurial > repos > chmaramis > irprofiler
comparison data_filtering.py @ 0:0e37e5b73273 draft
Initial commit
author | chmaramis |
---|---|
date | Fri, 30 Mar 2018 07:22:29 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0e37e5b73273 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 Created on Wed Sep 4 18:41:42 2013 | |
4 | |
5 @author: chmaramis | |
6 """ | |
7 | |
8 from __future__ import division | |
9 import string as strpy | |
10 import numpy as np | |
11 from pandas import * | |
12 from numpy import nan as NA | |
13 import time | |
14 import sys | |
15 | |
16 | |
17 def filter_condition_AAjunction(x): | |
18 x= x.strip() | |
19 if ' ' in x: | |
20 return x.split(' ')[0] | |
21 else: | |
22 return x | |
23 | |
24 #-----------frame creation--------------------- | |
25 def dataFiltering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname): | |
26 | |
27 try: | |
28 path=inp | |
29 frame = DataFrame() | |
30 seqlen = [] | |
31 head = [] | |
32 tp = read_csv(path, iterator=True, chunksize=5000,sep='\t', index_col=0 ) | |
33 frame = concat([chunk for chunk in tp]) | |
34 | |
35 frcol = list(frame.columns) | |
36 #print frcol[-1] | |
37 if 'Unnamed' in frcol[-1]: | |
38 del frcol[-1] | |
39 frame=frame[frcol] | |
40 | |
41 frame.index = range(1,len(frame)+1) | |
42 | |
43 head.append('Total reads of raw data') | |
44 seqlen.append(len(frame)) | |
45 | |
46 #------------drop nulls-------------------- | |
47 filtered = DataFrame() | |
48 filtall = DataFrame() | |
49 summ_df = DataFrame() | |
50 filtered = frame[isnull(frame['AA JUNCTION']) | isnull(frame['V-GENE and allele'])] | |
51 | |
52 filtall = filtall.append(filtered) | |
53 if len(filtall) > 0: | |
54 filtall.loc[filtered.index,'Reason'] = "NoResults" | |
55 frame = frame[frame['AA JUNCTION'].notnull()] | |
56 frame = frame[frame['V-GENE and allele'].notnull()] | |
57 | |
58 head.append('Not Null CDR3/V') | |
59 head.append('filter out') | |
60 seqlen.append(len(frame)) | |
61 seqlen.append(len(filtered)) | |
62 filtered = DataFrame() | |
63 | |
64 if psorf.startswith('y') or psorf.startswith('Y'): | |
65 | |
66 cc0=np.array(frame['V-GENE and allele'].unique()) | |
67 | |
68 | |
69 for x in cc0: | |
70 x1=x.split('*') | |
71 try: | |
72 if (x1[1].find('P')>-1) or (x1[1].find('ORF')>-1): | |
73 filtered = filtered.append(frame[frame['V-GENE and allele'] == x]) | |
74 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,NA) | |
75 elif x.find('or')>-1: | |
76 posa=x.count('or') | |
77 x2=x.split('or') | |
78 x4='' | |
79 genelist=[] | |
80 for cnt in range(0, posa+1): | |
81 x3=x2[cnt].split('*') | |
82 x3[0]=x3[0].strip()#kobei ta space | |
83 k=x3[0].split(' ')# holds only TRBV | |
84 if cnt==0: | |
85 genelist.append(k[1]) | |
86 x4+=k[1] | |
87 elif ((str(k[1]) in genelist) == False) & (x3[1].find('P')==-1):# check for P in x3 | |
88 genelist.append(k[1]) | |
89 x4+=' or ' | |
90 x4+=k[1] | |
91 x3=None | |
92 k1=None | |
93 genelist=None | |
94 | |
95 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,x4) | |
96 | |
97 else: | |
98 s=x1[0].split(' ') | |
99 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,s[1]) | |
100 except IndexError as e: | |
101 print('V-gene is already been formed') | |
102 continue | |
103 | |
104 x=None | |
105 x1=None | |
106 s=None | |
107 | |
108 filtall = filtall.append(filtered) | |
109 if len(filtall) > 0: | |
110 filtall.loc[filtered.index,'Reason'] = 'P or ORF' | |
111 frame = frame[frame['V-GENE and allele'].notnull()] | |
112 | |
113 head.append('Functional TRBV') | |
114 head.append('filter out') | |
115 seqlen.append(len(frame)) | |
116 seqlen.append(len(filtered)) | |
117 filtered = DataFrame() | |
118 | |
119 | |
120 | |
121 #------------FILTERING for data quality-------------------- | |
122 if con.startswith('y') or con.startswith('Y'): | |
123 filtered = frame [frame['AA JUNCTION'].str.contains('X') | | |
124 frame['AA JUNCTION'].str.contains('#') | | |
125 frame['AA JUNCTION'].str.contains('[*]')] | |
126 | |
127 | |
128 | |
129 frame = frame [~frame['AA JUNCTION'].str.contains('X') & | |
130 ~frame['AA JUNCTION'].str.contains('#') & | |
131 ~frame['AA JUNCTION'].str.contains('[*]') ] | |
132 | |
133 | |
134 filtall = filtall.append(filtered) | |
135 if len(filtall) > 0: | |
136 filtall.loc[filtered.index,'Reason'] = 'X,#,*' | |
137 head.append('Not Containing X,#,*') | |
138 head.append('filter out') | |
139 seqlen.append(len(frame)) | |
140 seqlen.append(len(filtered)) | |
141 filtered = DataFrame() | |
142 | |
143 | |
144 | |
145 if prod.startswith('y') or prod.startswith('Y'): | |
146 filtered = frame[~frame['Functionality'].str.startswith('productive')] | |
147 filtall = filtall.append(filtered) | |
148 if len(filtall) > 0: | |
149 filtall.loc[filtered.index,'Reason'] = 'not productive' | |
150 | |
151 | |
152 frame=frame[frame['Functionality'].str.startswith('productive')] | |
153 | |
154 head.append('Productive') | |
155 head.append('filter out') | |
156 seqlen.append(len(frame)) | |
157 | |
158 seqlen.append(len(filtered)) | |
159 | |
160 | |
161 frame['AA JUNCTION'] = frame['AA JUNCTION'].map(filter_condition_AAjunction) | |
162 | |
163 if CF.startswith('y') or CF.startswith('Y'): | |
164 if cells == 'TCR': | |
165 filtered = DataFrame() | |
166 filtered = frame[~frame['AA JUNCTION'].str.startswith('C') | | |
167 ~frame['AA JUNCTION'].str.endswith('F')] | |
168 | |
169 filtall = filtall.append(filtered) | |
170 if len(filtall) > 0: | |
171 filtall.loc[filtered.index,'Reason'] = 'Not C..F' | |
172 | |
173 frame = frame[frame['AA JUNCTION'].str.startswith('C') & | |
174 frame['AA JUNCTION'].str.endswith('F')] | |
175 | |
176 head.append('CDR3 landmarks C-F') | |
177 head.append('filter out') | |
178 seqlen.append(len(frame)) | |
179 seqlen.append(len(filtered)) | |
180 filtered = DataFrame() | |
181 elif cells == 'BCR': | |
182 filtered = DataFrame() | |
183 filtered = frame[~frame['AA JUNCTION'].str.startswith('C') | | |
184 ~frame['AA JUNCTION'].str.endswith('W')] | |
185 | |
186 filtall = filtall.append(filtered) | |
187 if len(filtall) > 0: | |
188 filtall.loc[filtered.index,'Reason'] = 'Not C..W' | |
189 | |
190 frame = frame[frame['AA JUNCTION'].str.startswith('C') & | |
191 frame['AA JUNCTION'].str.endswith('W')] | |
192 | |
193 head.append('CDR3 landmarks C-W') | |
194 head.append('filter out') | |
195 seqlen.append(len(frame)) | |
196 seqlen.append(len(filtered)) | |
197 filtered = DataFrame() | |
198 else: | |
199 print('TCR or BCR type') | |
200 | |
201 | |
202 filtered = DataFrame() | |
203 | |
204 filtered = frame[frame['V-REGION identity %'] < Vper] | |
205 | |
206 | |
207 filtall = filtall.append(filtered) | |
208 if len(filtall) > 0: | |
209 filtall.loc[filtered.index,'Reason'] = 'identity < {iden}%'.format(iden = Vper) | |
210 | |
211 frame=frame[frame['V-REGION identity %']>= Vper] | |
212 head.append('Identity >= {iden}%'.format(iden = Vper)) | |
213 head.append('filter out') | |
214 seqlen.append(len(frame)) | |
215 seqlen.append(len(filtered)) | |
216 | |
217 head.append('Total filter out A') | |
218 head.append('Total filter in A') | |
219 seqlen.append(len(filtall)) | |
220 seqlen.append(len(frame)) | |
221 | |
222 ############################### | |
223 if Vgene != 'null': | |
224 | |
225 filtered = DataFrame() | |
226 | |
227 filtered = frame[frame['V-GENE and allele'] != Vgene] | |
228 | |
229 filtall = filtall.append(filtered) | |
230 if len(filtall) > 0: | |
231 filtall.loc[filtered.index,'Reason'] = 'V-GENE != {} '.format(Vgene) | |
232 | |
233 | |
234 frame = frame[frame['V-GENE and allele'] == Vgene] | |
235 | |
236 | |
237 | |
238 head.append('V-GENE = {} '.format(Vgene)) | |
239 head.append('filter out') | |
240 seqlen.append(len(frame)) | |
241 seqlen.append(len(filtered)) | |
242 | |
243 | |
244 | |
245 ############################### | |
246 if (laa1 != 'null') or (laa2 != 'null'): | |
247 if int(laa2) == 0: | |
248 low = int(laa1) | |
249 high = 100 | |
250 elif int(laa1) > int(laa2): | |
251 low = int(laa2) | |
252 high = int(laa1) | |
253 else: | |
254 low = int(laa1) | |
255 high = int(laa2) | |
256 | |
257 filtered = DataFrame() | |
258 criteria = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) < low) | |
259 criteria2 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) > high) | |
260 filtered = frame[criteria | criteria2] | |
261 | |
262 filtall = filtall.append(filtered) | |
263 if int(laa2)==0: | |
264 if len(filtall) > 0: | |
265 filtall.loc[filtered.index,'Reason'] = 'CDR3 length not bigger than {}'.format(low) | |
266 else: | |
267 if len(filtall) > 0: | |
268 filtall.loc[filtered.index,'Reason'] = 'CDR3 length not from {} to {}'.format(low,high) | |
269 | |
270 criteria3 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) >= low) | |
271 criteria4 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) <= high) | |
272 frame = frame[criteria3 & criteria4] | |
273 | |
274 if int(laa2)==0: | |
275 head.append('CDR3 length bigger than {}'.format(low)) | |
276 else: | |
277 head.append('CDR3 length from {} to {} '.format(low,high)) | |
278 head.append('filter out') | |
279 seqlen.append(len(frame)) | |
280 seqlen.append(len(filtered)) | |
281 | |
282 ############################### | |
283 if conaa != 'null': | |
284 if conaa.islower(): | |
285 conaa = conaa.upper() | |
286 filtered = DataFrame() | |
287 | |
288 filtered = frame[~frame['AA JUNCTION'].str.contains(conaa)] | |
289 | |
290 filtall = filtall.append(filtered) | |
291 if len(filtall) > 0: | |
292 filtall.loc[filtered.index,'Reason'] = 'CDR3 not containing {}'.format(conaa) | |
293 | |
294 frame = frame[frame['AA JUNCTION'].str.contains(conaa) ] | |
295 | |
296 head.append('CDR3 containing {}'.format(conaa)) | |
297 head.append('filter out') | |
298 seqlen.append(len(frame)) | |
299 seqlen.append(len(filtered)) | |
300 | |
301 | |
302 | |
303 | |
304 #####------------keep the small J gene name-------------------- | |
305 #frame['J-GENE and allele'] = frame['J-GENE and allele'].map(filter_condition_Jgene) | |
306 cc2=np.array(frame['J-GENE and allele'].unique()) | |
307 | |
308 for x in cc2: | |
309 try: | |
310 if notnull(x): | |
311 x1=x.split('*') | |
312 # print(x) | |
313 # print (x1[0]) | |
314 trbj=x1[0].split(' ') | |
315 frame['J-GENE and allele']=frame['J-GENE and allele'].replace(x,trbj[1]) | |
316 except IndexError as e: | |
317 print('J-Gene has been formed') | |
318 | |
319 | |
320 | |
321 x=None | |
322 x1=None | |
323 | |
324 | |
325 #------------keep the small D gene name-------------------- | |
326 cc1=np.array(frame['D-GENE and allele'].unique()) | |
327 for x in cc1: | |
328 try: | |
329 if notnull(x): | |
330 x1=x.split('*') | |
331 trbd=x1[0].split(' ') | |
332 frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,trbd[1]) | |
333 else: | |
334 frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,'none') | |
335 except IndexError as e: | |
336 print('D-gene has been formed') | |
337 | |
338 | |
339 x=None | |
340 x1=None | |
341 | |
342 | |
343 if Jgene != 'null': | |
344 | |
345 filtered = DataFrame() | |
346 | |
347 filtered = frame[frame['J-GENE and allele'] != Jgene] | |
348 | |
349 filtall = filtall.append(filtered) | |
350 if len(filtall) > 0: | |
351 filtall.loc[filtered.index,'Reason'] = 'J-GENE not {} '.format(Jgene) | |
352 | |
353 | |
354 frame = frame[frame['J-GENE and allele'] == Jgene] | |
355 | |
356 | |
357 | |
358 head.append('J-GENE = {} '.format(Jgene)) | |
359 head.append('filter out') | |
360 seqlen.append(len(frame)) | |
361 seqlen.append(len(filtered)) | |
362 | |
363 | |
364 | |
365 if Dgene != 'null': | |
366 | |
367 filtered = DataFrame() | |
368 | |
369 filtered = frame[frame['D-GENE and allele'] != Dgene] | |
370 | |
371 filtall = filtall.append(filtered) | |
372 if len(filtall) > 0: | |
373 filtall.loc[filtered.index,'Reason'] = 'D-GENE not {} '.format(Dgene) | |
374 | |
375 | |
376 frame = frame[frame['D-GENE and allele'] == Dgene] | |
377 | |
378 | |
379 | |
380 head.append('D-GENE = {} '.format(Dgene)) | |
381 head.append('filter out') | |
382 seqlen.append(len(frame)) | |
383 seqlen.append(len(filtered)) | |
384 | |
385 | |
386 head.append('Total filter out') | |
387 head.append('Total filter in') | |
388 seqlen.append(len(filtall)) | |
389 seqlen.append(len(frame)) | |
390 summ_df = DataFrame(index = head) | |
391 col = fname | |
392 | |
393 summ_df[col] = seqlen | |
394 frame=frame.rename(columns = {'V-GENE and allele':'V-GENE', | |
395 'J-GENE and allele':'J-GENE','D-GENE and allele':'D-GENE'}) | |
396 | |
397 | |
398 frcol.append('Reason') | |
399 | |
400 filtall = filtall[frcol] | |
401 | |
402 #--------------out CSV--------------------------- | |
403 frame.index = range(1,len(frame)+1) | |
404 if not summ_df.empty: | |
405 summ_df['%'] = (100*summ_df[summ_df.columns[0]]/summ_df[summ_df.columns[0]][summ_df.index[0]]).map(('{:.4f}'.format)) | |
406 return(frame,filtall,summ_df) | |
407 except KeyError as e: | |
408 print('This file has no ' + str(e) + ' column') | |
409 return(frame,filtall,summ_df) | |
410 | |
411 | |
412 if __name__ == '__main__': | |
413 | |
414 start=time.time() | |
415 | |
416 # Parse input arguments | |
417 inp = sys.argv[1] | |
418 cells = sys.argv[2] | |
419 psorf = sys.argv[3] | |
420 con = sys.argv[4] | |
421 prod = sys.argv[5] | |
422 CF = sys.argv[6] | |
423 Vper = float(sys.argv[7]) | |
424 Vgene = sys.argv[8] | |
425 laa1 = sys.argv[9] | |
426 conaa = sys.argv[10] | |
427 filterin = sys.argv[11] | |
428 filterout = sys.argv[12] | |
429 Sum_table = sys.argv[13] | |
430 Jgene = sys.argv[14] | |
431 Dgene = sys.argv[15] | |
432 laa2 = sys.argv[16] | |
433 fname = sys.argv[17] | |
434 | |
435 # Execute basic function | |
436 fin,fout,summ = dataFiltering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname) | |
437 | |
438 # Save output to CSV files | |
439 if not summ.empty: | |
440 summ.to_csv(Sum_table, sep = '\t') | |
441 if not fin.empty: | |
442 fin.to_csv(filterin , sep = '\t') | |
443 if not fout.empty: | |
444 fout.to_csv(filterout, sep= '\t') | |
445 | |
446 # Print execution time | |
447 stop=time.time() | |
448 print('Runtime:' + str(stop-start)) | |
449 |