Mercurial > repos > chmaramis > irprofiler
view data_filtering.py @ 1:acaa8e8a0b88 draft default tip
Uploaded test-data & added tool help
author | chmaramis |
---|---|
date | Mon, 30 Apr 2018 04:47:52 -0400 |
parents | 0e37e5b73273 |
children |
line wrap: on
line source
# -*- coding: utf-8 -*- """ Created on Wed Sep 4 18:41:42 2013 @author: chmaramis """ from __future__ import division import string as strpy import numpy as np from pandas import * from numpy import nan as NA import time import sys def filter_condition_AAjunction(x): x= x.strip() if ' ' in x: return x.split(' ')[0] else: return x #-----------frame creation--------------------- def dataFiltering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname): try: path=inp frame = DataFrame() seqlen = [] head = [] tp = read_csv(path, iterator=True, chunksize=5000,sep='\t', index_col=0 ) frame = concat([chunk for chunk in tp]) frcol = list(frame.columns) #print frcol[-1] if 'Unnamed' in frcol[-1]: del frcol[-1] frame=frame[frcol] frame.index = range(1,len(frame)+1) head.append('Total reads of raw data') seqlen.append(len(frame)) #------------drop nulls-------------------- filtered = DataFrame() filtall = DataFrame() summ_df = DataFrame() filtered = frame[isnull(frame['AA JUNCTION']) | isnull(frame['V-GENE and allele'])] filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = "NoResults" frame = frame[frame['AA JUNCTION'].notnull()] frame = frame[frame['V-GENE and allele'].notnull()] head.append('Not Null CDR3/V') head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) filtered = DataFrame() if psorf.startswith('y') or psorf.startswith('Y'): cc0=np.array(frame['V-GENE and allele'].unique()) for x in cc0: x1=x.split('*') try: if (x1[1].find('P')>-1) or (x1[1].find('ORF')>-1): filtered = filtered.append(frame[frame['V-GENE and allele'] == x]) frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,NA) elif x.find('or')>-1: posa=x.count('or') x2=x.split('or') x4='' genelist=[] for cnt in range(0, posa+1): x3=x2[cnt].split('*') x3[0]=x3[0].strip()#kobei ta space k=x3[0].split(' ')# holds only TRBV if cnt==0: genelist.append(k[1]) x4+=k[1] elif ((str(k[1]) in genelist) == False) & (x3[1].find('P')==-1):# check for P in x3 genelist.append(k[1]) x4+=' or ' x4+=k[1] x3=None k1=None genelist=None frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,x4) else: s=x1[0].split(' ') frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,s[1]) except IndexError as e: print('V-gene is already been formed') continue x=None x1=None s=None filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'P or ORF' frame = frame[frame['V-GENE and allele'].notnull()] head.append('Functional TRBV') head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) filtered = DataFrame() #------------FILTERING for data quality-------------------- if con.startswith('y') or con.startswith('Y'): filtered = frame [frame['AA JUNCTION'].str.contains('X') | frame['AA JUNCTION'].str.contains('#') | frame['AA JUNCTION'].str.contains('[*]')] frame = frame [~frame['AA JUNCTION'].str.contains('X') & ~frame['AA JUNCTION'].str.contains('#') & ~frame['AA JUNCTION'].str.contains('[*]') ] filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'X,#,*' head.append('Not Containing X,#,*') head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) filtered = DataFrame() if prod.startswith('y') or prod.startswith('Y'): filtered = frame[~frame['Functionality'].str.startswith('productive')] filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'not productive' frame=frame[frame['Functionality'].str.startswith('productive')] head.append('Productive') head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) frame['AA JUNCTION'] = frame['AA JUNCTION'].map(filter_condition_AAjunction) if CF.startswith('y') or CF.startswith('Y'): if cells == 'TCR': filtered = DataFrame() filtered = frame[~frame['AA JUNCTION'].str.startswith('C') | ~frame['AA JUNCTION'].str.endswith('F')] filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'Not C..F' frame = frame[frame['AA JUNCTION'].str.startswith('C') & frame['AA JUNCTION'].str.endswith('F')] head.append('CDR3 landmarks C-F') head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) filtered = DataFrame() elif cells == 'BCR': filtered = DataFrame() filtered = frame[~frame['AA JUNCTION'].str.startswith('C') | ~frame['AA JUNCTION'].str.endswith('W')] filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'Not C..W' frame = frame[frame['AA JUNCTION'].str.startswith('C') & frame['AA JUNCTION'].str.endswith('W')] head.append('CDR3 landmarks C-W') head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) filtered = DataFrame() else: print('TCR or BCR type') filtered = DataFrame() filtered = frame[frame['V-REGION identity %'] < Vper] filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'identity < {iden}%'.format(iden = Vper) frame=frame[frame['V-REGION identity %']>= Vper] head.append('Identity >= {iden}%'.format(iden = Vper)) head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) head.append('Total filter out A') head.append('Total filter in A') seqlen.append(len(filtall)) seqlen.append(len(frame)) ############################### if Vgene != 'null': filtered = DataFrame() filtered = frame[frame['V-GENE and allele'] != Vgene] filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'V-GENE != {} '.format(Vgene) frame = frame[frame['V-GENE and allele'] == Vgene] head.append('V-GENE = {} '.format(Vgene)) head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) ############################### if (laa1 != 'null') or (laa2 != 'null'): if int(laa2) == 0: low = int(laa1) high = 100 elif int(laa1) > int(laa2): low = int(laa2) high = int(laa1) else: low = int(laa1) high = int(laa2) filtered = DataFrame() criteria = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) < low) criteria2 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) > high) filtered = frame[criteria | criteria2] filtall = filtall.append(filtered) if int(laa2)==0: if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'CDR3 length not bigger than {}'.format(low) else: if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'CDR3 length not from {} to {}'.format(low,high) criteria3 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) >= low) criteria4 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) <= high) frame = frame[criteria3 & criteria4] if int(laa2)==0: head.append('CDR3 length bigger than {}'.format(low)) else: head.append('CDR3 length from {} to {} '.format(low,high)) head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) ############################### if conaa != 'null': if conaa.islower(): conaa = conaa.upper() filtered = DataFrame() filtered = frame[~frame['AA JUNCTION'].str.contains(conaa)] filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'CDR3 not containing {}'.format(conaa) frame = frame[frame['AA JUNCTION'].str.contains(conaa) ] head.append('CDR3 containing {}'.format(conaa)) head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) #####------------keep the small J gene name-------------------- #frame['J-GENE and allele'] = frame['J-GENE and allele'].map(filter_condition_Jgene) cc2=np.array(frame['J-GENE and allele'].unique()) for x in cc2: try: if notnull(x): x1=x.split('*') # print(x) # print (x1[0]) trbj=x1[0].split(' ') frame['J-GENE and allele']=frame['J-GENE and allele'].replace(x,trbj[1]) except IndexError as e: print('J-Gene has been formed') x=None x1=None #------------keep the small D gene name-------------------- cc1=np.array(frame['D-GENE and allele'].unique()) for x in cc1: try: if notnull(x): x1=x.split('*') trbd=x1[0].split(' ') frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,trbd[1]) else: frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,'none') except IndexError as e: print('D-gene has been formed') x=None x1=None if Jgene != 'null': filtered = DataFrame() filtered = frame[frame['J-GENE and allele'] != Jgene] filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'J-GENE not {} '.format(Jgene) frame = frame[frame['J-GENE and allele'] == Jgene] head.append('J-GENE = {} '.format(Jgene)) head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) if Dgene != 'null': filtered = DataFrame() filtered = frame[frame['D-GENE and allele'] != Dgene] filtall = filtall.append(filtered) if len(filtall) > 0: filtall.loc[filtered.index,'Reason'] = 'D-GENE not {} '.format(Dgene) frame = frame[frame['D-GENE and allele'] == Dgene] head.append('D-GENE = {} '.format(Dgene)) head.append('filter out') seqlen.append(len(frame)) seqlen.append(len(filtered)) head.append('Total filter out') head.append('Total filter in') seqlen.append(len(filtall)) seqlen.append(len(frame)) summ_df = DataFrame(index = head) col = fname summ_df[col] = seqlen frame=frame.rename(columns = {'V-GENE and allele':'V-GENE', 'J-GENE and allele':'J-GENE','D-GENE and allele':'D-GENE'}) frcol.append('Reason') filtall = filtall[frcol] #--------------out CSV--------------------------- frame.index = range(1,len(frame)+1) if not summ_df.empty: summ_df['%'] = (100*summ_df[summ_df.columns[0]]/summ_df[summ_df.columns[0]][summ_df.index[0]]).map(('{:.4f}'.format)) return(frame,filtall,summ_df) except KeyError as e: print('This file has no ' + str(e) + ' column') return(frame,filtall,summ_df) if __name__ == '__main__': start=time.time() # Parse input arguments inp = sys.argv[1] cells = sys.argv[2] psorf = sys.argv[3] con = sys.argv[4] prod = sys.argv[5] CF = sys.argv[6] Vper = float(sys.argv[7]) Vgene = sys.argv[8] laa1 = sys.argv[9] conaa = sys.argv[10] filterin = sys.argv[11] filterout = sys.argv[12] Sum_table = sys.argv[13] Jgene = sys.argv[14] Dgene = sys.argv[15] laa2 = sys.argv[16] fname = sys.argv[17] # Execute basic function fin,fout,summ = dataFiltering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname) # Save output to CSV files if not summ.empty: summ.to_csv(Sum_table, sep = '\t') if not fin.empty: fin.to_csv(filterin , sep = '\t') if not fout.empty: fout.to_csv(filterout, sep= '\t') # Print execution time stop=time.time() print('Runtime:' + str(stop-start))