Mercurial > repos > chmaramis > testirprofiler
comparison ngs_filtering.py @ 12:cdf95051bc55 draft default tip
Uploaded 2 tools
| author | chmaramis |
|---|---|
| date | Sun, 18 Mar 2018 07:11:06 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 11:14896ea6e180 | 12:cdf95051bc55 |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """ | |
| 3 Created on Wed Sep 4 18:41:42 2013 | |
| 4 | |
| 5 @author: chmaramis | |
| 6 """ | |
| 7 | |
| 8 from __future__ import division | |
| 9 import string as strpy | |
| 10 import numpy as np | |
| 11 from pandas import * | |
| 12 from numpy import nan as NA | |
| 13 import time | |
| 14 import sys | |
| 15 | |
| 16 | |
| 17 def filter_condition_AAjunction(x): | |
| 18 x= x.strip() | |
| 19 if ' ' in x: | |
| 20 return x.split(' ')[0] | |
| 21 else: | |
| 22 return x | |
| 23 | |
| 24 #-----------frame creation--------------------- | |
| 25 def filtering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname): | |
| 26 | |
| 27 try: | |
| 28 path=inp | |
| 29 frame = DataFrame() | |
| 30 seqlen = [] | |
| 31 head = [] | |
| 32 tp = read_csv(path, iterator=True, chunksize=5000,sep='\t', index_col=0 ) | |
| 33 frame = concat([chunk for chunk in tp]) | |
| 34 | |
| 35 frcol = list(frame.columns) | |
| 36 #print frcol[-1] | |
| 37 if 'Unnamed' in frcol[-1]: | |
| 38 del frcol[-1] | |
| 39 frame=frame[frcol] | |
| 40 | |
| 41 frame.index = range(1,len(frame)+1) | |
| 42 | |
| 43 head.append('Total reads of raw data') | |
| 44 seqlen.append(len(frame)) | |
| 45 | |
| 46 #------------drop nulls-------------------- | |
| 47 filtered = DataFrame() | |
| 48 filtall = DataFrame() | |
| 49 summ_df = DataFrame() | |
| 50 filtered = frame[isnull(frame['AA JUNCTION']) | isnull(frame['V-GENE and allele'])] | |
| 51 | |
| 52 filtall = filtall.append(filtered) | |
| 53 if len(filtall) > 0: | |
| 54 filtall.loc[filtered.index,'Reason'] = "NoResults" | |
| 55 frame = frame[frame['AA JUNCTION'].notnull()] | |
| 56 frame = frame[frame['V-GENE and allele'].notnull()] | |
| 57 | |
| 58 head.append('Not Null CDR3/V') | |
| 59 head.append('filter out') | |
| 60 seqlen.append(len(frame)) | |
| 61 seqlen.append(len(filtered)) | |
| 62 filtered = DataFrame() | |
| 63 | |
| 64 if psorf.startswith('y') or psorf.startswith('Y'): | |
| 65 | |
| 66 cc0=np.array(frame['V-GENE and allele'].unique()) | |
| 67 | |
| 68 | |
| 69 for x in cc0: | |
| 70 x1=x.split('*') | |
| 71 try: | |
| 72 if (x1[1].find('P')>-1) or (x1[1].find('ORF')>-1): | |
| 73 filtered = filtered.append(frame[frame['V-GENE and allele'] == x]) | |
| 74 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,NA) | |
| 75 elif x.find('or')>-1: | |
| 76 posa=x.count('or') | |
| 77 x2=x.split('or') | |
| 78 x4='' | |
| 79 genelist=[] | |
| 80 for cnt in range(0, posa+1): | |
| 81 x3=x2[cnt].split('*') | |
| 82 x3[0]=x3[0].strip()#kobei ta space | |
| 83 k=x3[0].split(' ')# holds only TRBV | |
| 84 if cnt==0: | |
| 85 genelist.append(k[1]) | |
| 86 x4+=k[1] | |
| 87 elif ((str(k[1]) in genelist) == False) & (x3[1].find('P')==-1):# check for P in x3 | |
| 88 genelist.append(k[1]) | |
| 89 x4+=' or ' | |
| 90 x4+=k[1] | |
| 91 x3=None | |
| 92 k1=None | |
| 93 genelist=None | |
| 94 | |
| 95 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,x4) | |
| 96 | |
| 97 else: | |
| 98 s=x1[0].split(' ') | |
| 99 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,s[1]) | |
| 100 except IndexError as e: | |
| 101 print('V-gene is already been formed') | |
| 102 continue | |
| 103 | |
| 104 x=None | |
| 105 x1=None | |
| 106 s=None | |
| 107 | |
| 108 filtall = filtall.append(filtered) | |
| 109 if len(filtall) > 0: | |
| 110 filtall.loc[filtered.index,'Reason'] = 'P or ORF' | |
| 111 frame = frame[frame['V-GENE and allele'].notnull()] | |
| 112 | |
| 113 head.append('Functional TRBV') | |
| 114 head.append('filter out') | |
| 115 seqlen.append(len(frame)) | |
| 116 seqlen.append(len(filtered)) | |
| 117 filtered = DataFrame() | |
| 118 | |
| 119 | |
| 120 | |
| 121 #------------FILTERING for data quality-------------------- | |
| 122 if con.startswith('y') or con.startswith('Y'): | |
| 123 filtered = frame [frame['AA JUNCTION'].str.contains('X') | | |
| 124 frame['AA JUNCTION'].str.contains('#') | | |
| 125 frame['AA JUNCTION'].str.contains('[*]')] | |
| 126 | |
| 127 | |
| 128 | |
| 129 frame = frame [~frame['AA JUNCTION'].str.contains('X') & | |
| 130 ~frame['AA JUNCTION'].str.contains('#') & | |
| 131 ~frame['AA JUNCTION'].str.contains('[*]') ] | |
| 132 | |
| 133 | |
| 134 filtall = filtall.append(filtered) | |
| 135 if len(filtall) > 0: | |
| 136 filtall.loc[filtered.index,'Reason'] = 'X,#,*' | |
| 137 head.append('Not Containing X,#,*') | |
| 138 head.append('filter out') | |
| 139 seqlen.append(len(frame)) | |
| 140 seqlen.append(len(filtered)) | |
| 141 filtered = DataFrame() | |
| 142 | |
| 143 # Set label of functionality column, taking into account current & past IMGT Summary column label | |
| 144 functionality_label = 'Functionality' | |
| 145 if 'V-DOMAIN Functionality' in frame.columns: | |
| 146 functionality_label = 'V-DOMAIN Functionality' | |
| 147 | |
| 148 if prod.startswith('y') or prod.startswith('Y'): | |
| 149 filtered = frame[~frame[functionality_label].str.startswith('productive')] | |
| 150 filtall = filtall.append(filtered) | |
| 151 if len(filtall) > 0: | |
| 152 filtall.loc[filtered.index,'Reason'] = 'not productive' | |
| 153 | |
| 154 | |
| 155 frame=frame[frame[functionality_label].str.startswith('productive')] | |
| 156 | |
| 157 head.append('Productive') | |
| 158 head.append('filter out') | |
| 159 seqlen.append(len(frame)) | |
| 160 | |
| 161 seqlen.append(len(filtered)) | |
| 162 | |
| 163 | |
| 164 frame['AA JUNCTION'] = frame['AA JUNCTION'].map(filter_condition_AAjunction) | |
| 165 | |
| 166 if CF.startswith('y') or CF.startswith('Y'): | |
| 167 if cells == 'TCR': | |
| 168 filtered = DataFrame() | |
| 169 filtered = frame[~frame['AA JUNCTION'].str.startswith('C') | | |
| 170 ~frame['AA JUNCTION'].str.endswith('F')] | |
| 171 | |
| 172 filtall = filtall.append(filtered) | |
| 173 if len(filtall) > 0: | |
| 174 filtall.loc[filtered.index,'Reason'] = 'Not C..F' | |
| 175 | |
| 176 frame = frame[frame['AA JUNCTION'].str.startswith('C') & | |
| 177 frame['AA JUNCTION'].str.endswith('F')] | |
| 178 | |
| 179 head.append('CDR3 landmarks C-F') | |
| 180 head.append('filter out') | |
| 181 seqlen.append(len(frame)) | |
| 182 seqlen.append(len(filtered)) | |
| 183 filtered = DataFrame() | |
| 184 elif cells == 'BCR': | |
| 185 filtered = DataFrame() | |
| 186 filtered = frame[~frame['AA JUNCTION'].str.startswith('C') | | |
| 187 ~frame['AA JUNCTION'].str.endswith('W')] | |
| 188 | |
| 189 filtall = filtall.append(filtered) | |
| 190 if len(filtall) > 0: | |
| 191 filtall.loc[filtered.index,'Reason'] = 'Not C..W' | |
| 192 | |
| 193 frame = frame[frame['AA JUNCTION'].str.startswith('C') & | |
| 194 frame['AA JUNCTION'].str.endswith('W')] | |
| 195 | |
| 196 head.append('CDR3 landmarks C-W') | |
| 197 head.append('filter out') | |
| 198 seqlen.append(len(frame)) | |
| 199 seqlen.append(len(filtered)) | |
| 200 filtered = DataFrame() | |
| 201 else: | |
| 202 print('TCR or BCR type') | |
| 203 | |
| 204 | |
| 205 filtered = DataFrame() | |
| 206 | |
| 207 filtered = frame[frame['V-REGION identity %'] < Vper] | |
| 208 | |
| 209 | |
| 210 filtall = filtall.append(filtered) | |
| 211 if len(filtall) > 0: | |
| 212 filtall.loc[filtered.index,'Reason'] = 'identity < {iden}%'.format(iden = Vper) | |
| 213 | |
| 214 frame=frame[frame['V-REGION identity %']>= Vper] | |
| 215 head.append('Identity >= {iden}%'.format(iden = Vper)) | |
| 216 head.append('filter out') | |
| 217 seqlen.append(len(frame)) | |
| 218 seqlen.append(len(filtered)) | |
| 219 | |
| 220 head.append('Total filter out A') | |
| 221 head.append('Total filter in A') | |
| 222 seqlen.append(len(filtall)) | |
| 223 seqlen.append(len(frame)) | |
| 224 | |
| 225 ############################### | |
| 226 if Vgene != 'null': | |
| 227 | |
| 228 filtered = DataFrame() | |
| 229 | |
| 230 filtered = frame[frame['V-GENE and allele'] != Vgene] | |
| 231 | |
| 232 filtall = filtall.append(filtered) | |
| 233 if len(filtall) > 0: | |
| 234 filtall.loc[filtered.index,'Reason'] = 'V-GENE != {} '.format(Vgene) | |
| 235 | |
| 236 | |
| 237 frame = frame[frame['V-GENE and allele'] == Vgene] | |
| 238 | |
| 239 | |
| 240 | |
| 241 head.append('V-GENE = {} '.format(Vgene)) | |
| 242 head.append('filter out') | |
| 243 seqlen.append(len(frame)) | |
| 244 seqlen.append(len(filtered)) | |
| 245 | |
| 246 | |
| 247 | |
| 248 ############################### | |
| 249 if (laa1 != 'null') or (laa2 != 'null'): | |
| 250 if int(laa2) == 0: | |
| 251 low = int(laa1) | |
| 252 high = 100 | |
| 253 elif int(laa1) > int(laa2): | |
| 254 low = int(laa2) | |
| 255 high = int(laa1) | |
| 256 else: | |
| 257 low = int(laa1) | |
| 258 high = int(laa2) | |
| 259 | |
| 260 filtered = DataFrame() | |
| 261 criteria = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) < low) | |
| 262 criteria2 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) > high) | |
| 263 filtered = frame[criteria | criteria2] | |
| 264 | |
| 265 filtall = filtall.append(filtered) | |
| 266 if int(laa2)==0: | |
| 267 if len(filtall) > 0: | |
| 268 filtall.loc[filtered.index,'Reason'] = 'CDR3 length not bigger than {}'.format(low) | |
| 269 else: | |
| 270 if len(filtall) > 0: | |
| 271 filtall.loc[filtered.index,'Reason'] = 'CDR3 length not from {} to {}'.format(low,high) | |
| 272 | |
| 273 criteria3 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) >= low) | |
| 274 criteria4 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) <= high) | |
| 275 frame = frame[criteria3 & criteria4] | |
| 276 | |
| 277 if int(laa2)==0: | |
| 278 head.append('CDR3 length bigger than {}'.format(low)) | |
| 279 else: | |
| 280 head.append('CDR3 length from {} to {} '.format(low,high)) | |
| 281 head.append('filter out') | |
| 282 seqlen.append(len(frame)) | |
| 283 seqlen.append(len(filtered)) | |
| 284 | |
| 285 ############################### | |
| 286 if conaa != 'null': | |
| 287 if conaa.islower(): | |
| 288 conaa = conaa.upper() | |
| 289 filtered = DataFrame() | |
| 290 | |
| 291 filtered = frame[~frame['AA JUNCTION'].str.contains(conaa)] | |
| 292 | |
| 293 filtall = filtall.append(filtered) | |
| 294 if len(filtall) > 0: | |
| 295 filtall.loc[filtered.index,'Reason'] = 'CDR3 not containing {}'.format(conaa) | |
| 296 | |
| 297 frame = frame[frame['AA JUNCTION'].str.contains(conaa) ] | |
| 298 | |
| 299 head.append('CDR3 containing {}'.format(conaa)) | |
| 300 head.append('filter out') | |
| 301 seqlen.append(len(frame)) | |
| 302 seqlen.append(len(filtered)) | |
| 303 | |
| 304 | |
| 305 | |
| 306 | |
| 307 #####------------keep the small J gene name-------------------- | |
| 308 #frame['J-GENE and allele'] = frame['J-GENE and allele'].map(filter_condition_Jgene) | |
| 309 cc2=np.array(frame['J-GENE and allele'].unique()) | |
| 310 | |
| 311 for x in cc2: | |
| 312 try: | |
| 313 if notnull(x): | |
| 314 x1=x.split('*') | |
| 315 # print(x) | |
| 316 # print (x1[0]) | |
| 317 trbj=x1[0].split(' ') | |
| 318 frame['J-GENE and allele']=frame['J-GENE and allele'].replace(x,trbj[1]) | |
| 319 except IndexError as e: | |
| 320 print('J-Gene has been formed') | |
| 321 | |
| 322 | |
| 323 | |
| 324 x=None | |
| 325 x1=None | |
| 326 | |
| 327 | |
| 328 #------------keep the small D gene name-------------------- | |
| 329 cc1=np.array(frame['D-GENE and allele'].unique()) | |
| 330 for x in cc1: | |
| 331 try: | |
| 332 if notnull(x): | |
| 333 x1=x.split('*') | |
| 334 trbd=x1[0].split(' ') | |
| 335 frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,trbd[1]) | |
| 336 else: | |
| 337 frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,'none') | |
| 338 except IndexError as e: | |
| 339 print('D-gene has been formed') | |
| 340 | |
| 341 | |
| 342 x=None | |
| 343 x1=None | |
| 344 | |
| 345 | |
| 346 if Jgene != 'null': | |
| 347 | |
| 348 filtered = DataFrame() | |
| 349 | |
| 350 filtered = frame[frame['J-GENE and allele'] != Jgene] | |
| 351 | |
| 352 filtall = filtall.append(filtered) | |
| 353 if len(filtall) > 0: | |
| 354 filtall.loc[filtered.index,'Reason'] = 'J-GENE not {} '.format(Jgene) | |
| 355 | |
| 356 | |
| 357 frame = frame[frame['J-GENE and allele'] == Jgene] | |
| 358 | |
| 359 | |
| 360 | |
| 361 head.append('J-GENE = {} '.format(Jgene)) | |
| 362 head.append('filter out') | |
| 363 seqlen.append(len(frame)) | |
| 364 seqlen.append(len(filtered)) | |
| 365 | |
| 366 | |
| 367 | |
| 368 if Dgene != 'null': | |
| 369 | |
| 370 filtered = DataFrame() | |
| 371 | |
| 372 filtered = frame[frame['D-GENE and allele'] != Dgene] | |
| 373 | |
| 374 filtall = filtall.append(filtered) | |
| 375 if len(filtall) > 0: | |
| 376 filtall.loc[filtered.index,'Reason'] = 'D-GENE not {} '.format(Dgene) | |
| 377 | |
| 378 | |
| 379 frame = frame[frame['D-GENE and allele'] == Dgene] | |
| 380 | |
| 381 | |
| 382 | |
| 383 head.append('D-GENE = {} '.format(Dgene)) | |
| 384 head.append('filter out') | |
| 385 seqlen.append(len(frame)) | |
| 386 seqlen.append(len(filtered)) | |
| 387 | |
| 388 | |
| 389 head.append('Total filter out') | |
| 390 head.append('Total filter in') | |
| 391 seqlen.append(len(filtall)) | |
| 392 seqlen.append(len(frame)) | |
| 393 summ_df = DataFrame(index = head) | |
| 394 col = fname | |
| 395 | |
| 396 summ_df[col] = seqlen | |
| 397 frame=frame.rename(columns = {'V-GENE and allele':'V-GENE', | |
| 398 'J-GENE and allele':'J-GENE','D-GENE and allele':'D-GENE'}) | |
| 399 | |
| 400 | |
| 401 frcol.append('Reason') | |
| 402 | |
| 403 filtall = filtall[frcol] | |
| 404 | |
| 405 #--------------out CSV--------------------------- | |
| 406 frame.index = range(1,len(frame)+1) | |
| 407 if not summ_df.empty: | |
| 408 summ_df['%'] = (100*summ_df[summ_df.columns[0]]/summ_df[summ_df.columns[0]][summ_df.index[0]]).map(('{:.4f}'.format)) | |
| 409 return(frame,filtall,summ_df) | |
| 410 except KeyError as e: | |
| 411 print('This file has no ' + str(e) + ' column') | |
| 412 return(frame,filtall,summ_df) | |
| 413 | |
| 414 | |
| 415 if __name__ == '__main__': | |
| 416 | |
| 417 start=time.time() | |
| 418 | |
| 419 # Parse input arguments | |
| 420 inp = sys.argv[1] | |
| 421 cells = sys.argv[2] | |
| 422 psorf = sys.argv[3] | |
| 423 con = sys.argv[4] | |
| 424 prod = sys.argv[5] | |
| 425 CF = sys.argv[6] | |
| 426 Vper = float(sys.argv[7]) | |
| 427 Vgene = sys.argv[8] | |
| 428 laa1 = sys.argv[9] | |
| 429 conaa = sys.argv[10] | |
| 430 filterin = sys.argv[11] | |
| 431 filterout = sys.argv[12] | |
| 432 Sum_table = sys.argv[13] | |
| 433 Jgene = sys.argv[14] | |
| 434 Dgene = sys.argv[15] | |
| 435 laa2 = sys.argv[16] | |
| 436 fname = sys.argv[17] | |
| 437 | |
| 438 # Execute basic function | |
| 439 fin,fout,summ = filtering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname) | |
| 440 | |
| 441 # Save output to CSV files | |
| 442 if not summ.empty: | |
| 443 summ.to_csv(Sum_table, sep = '\t') | |
| 444 if not fin.empty: | |
| 445 fin.to_csv(filterin , sep = '\t') | |
| 446 if not fout.empty: | |
| 447 fout.to_csv(filterout, sep= '\t') | |
| 448 | |
| 449 # Print execution time | |
| 450 stop=time.time() | |
| 451 print('Runtime:' + str(stop-start)) | |
| 452 |
