annotate data_filtering.py @ 1:acaa8e8a0b88 draft default tip

Uploaded test-data & added tool help
author chmaramis
date Mon, 30 Apr 2018 04:47:52 -0400
parents 0e37e5b73273
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
1 # -*- coding: utf-8 -*-
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
2 """
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
3 Created on Wed Sep 4 18:41:42 2013
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
4
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
5 @author: chmaramis
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
6 """
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
7
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
8 from __future__ import division
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
9 import string as strpy
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
10 import numpy as np
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
11 from pandas import *
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
12 from numpy import nan as NA
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
13 import time
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
14 import sys
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
15
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
16
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
def filter_condition_AAjunction(x):
    """Return the leading space-delimited token of an AA junction string.

    Surrounding whitespace is stripped first. If the stripped text contains
    a space, only the part before the first space is returned; otherwise the
    stripped text is returned unchanged.

    Parameters
    ----------
    x : str
        Raw value of one 'AA JUNCTION' cell.

    Returns
    -------
    str
        The stripped value, truncated at the first space if there is one.
    """
    # partition(' ') returns the whole string as element 0 when no space is
    # present, so it covers both the "has a space" and "no space" cases.
    return x.strip().partition(' ')[0]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
23
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
24 #-----------frame creation---------------------
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
25 def dataFiltering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
26
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
27 try:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
28 path=inp
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
29 frame = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
30 seqlen = []
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
31 head = []
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
32 tp = read_csv(path, iterator=True, chunksize=5000,sep='\t', index_col=0 )
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
33 frame = concat([chunk for chunk in tp])
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
34
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
35 frcol = list(frame.columns)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
36 #print frcol[-1]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
37 if 'Unnamed' in frcol[-1]:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
38 del frcol[-1]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
39 frame=frame[frcol]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
40
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
41 frame.index = range(1,len(frame)+1)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
42
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
43 head.append('Total reads of raw data')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
44 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
45
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
46 #------------drop nulls--------------------
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
47 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
48 filtall = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
49 summ_df = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
50 filtered = frame[isnull(frame['AA JUNCTION']) | isnull(frame['V-GENE and allele'])]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
51
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
52 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
53 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
54 filtall.loc[filtered.index,'Reason'] = "NoResults"
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
55 frame = frame[frame['AA JUNCTION'].notnull()]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
56 frame = frame[frame['V-GENE and allele'].notnull()]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
57
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
58 head.append('Not Null CDR3/V')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
59 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
60 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
61 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
62 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
63
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
64 if psorf.startswith('y') or psorf.startswith('Y'):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
65
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
66 cc0=np.array(frame['V-GENE and allele'].unique())
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
67
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
68
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
69 for x in cc0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
70 x1=x.split('*')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
71 try:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
72 if (x1[1].find('P')>-1) or (x1[1].find('ORF')>-1):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
73 filtered = filtered.append(frame[frame['V-GENE and allele'] == x])
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
74 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,NA)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
75 elif x.find('or')>-1:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
76 posa=x.count('or')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
77 x2=x.split('or')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
78 x4=''
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
79 genelist=[]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
80 for cnt in range(0, posa+1):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
81 x3=x2[cnt].split('*')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
82 x3[0]=x3[0].strip()#kobei ta space
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
83 k=x3[0].split(' ')# holds only TRBV
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
84 if cnt==0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
85 genelist.append(k[1])
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
86 x4+=k[1]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
87 elif ((str(k[1]) in genelist) == False) & (x3[1].find('P')==-1):# check for P in x3
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
88 genelist.append(k[1])
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
89 x4+=' or '
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
90 x4+=k[1]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
91 x3=None
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
92 k1=None
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
93 genelist=None
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
94
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
95 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,x4)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
96
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
97 else:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
98 s=x1[0].split(' ')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
99 frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,s[1])
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
100 except IndexError as e:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
101 print('V-gene is already been formed')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
102 continue
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
103
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
104 x=None
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
105 x1=None
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
106 s=None
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
107
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
108 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
109 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
110 filtall.loc[filtered.index,'Reason'] = 'P or ORF'
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
111 frame = frame[frame['V-GENE and allele'].notnull()]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
112
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
113 head.append('Functional TRBV')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
114 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
115 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
116 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
117 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
118
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
119
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
120
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
121 #------------FILTERING for data quality--------------------
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
122 if con.startswith('y') or con.startswith('Y'):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
123 filtered = frame [frame['AA JUNCTION'].str.contains('X') |
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
124 frame['AA JUNCTION'].str.contains('#') |
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
125 frame['AA JUNCTION'].str.contains('[*]')]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
126
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
127
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
128
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
129 frame = frame [~frame['AA JUNCTION'].str.contains('X') &
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
130 ~frame['AA JUNCTION'].str.contains('#') &
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
131 ~frame['AA JUNCTION'].str.contains('[*]') ]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
132
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
133
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
134 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
135 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
136 filtall.loc[filtered.index,'Reason'] = 'X,#,*'
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
137 head.append('Not Containing X,#,*')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
138 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
139 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
140 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
141 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
142
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
143
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
144
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
145 if prod.startswith('y') or prod.startswith('Y'):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
146 filtered = frame[~frame['Functionality'].str.startswith('productive')]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
147 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
148 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
149 filtall.loc[filtered.index,'Reason'] = 'not productive'
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
150
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
151
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
152 frame=frame[frame['Functionality'].str.startswith('productive')]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
153
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
154 head.append('Productive')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
155 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
156 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
157
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
158 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
159
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
160
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
161 frame['AA JUNCTION'] = frame['AA JUNCTION'].map(filter_condition_AAjunction)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
162
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
163 if CF.startswith('y') or CF.startswith('Y'):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
164 if cells == 'TCR':
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
165 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
166 filtered = frame[~frame['AA JUNCTION'].str.startswith('C') |
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
167 ~frame['AA JUNCTION'].str.endswith('F')]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
168
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
169 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
170 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
171 filtall.loc[filtered.index,'Reason'] = 'Not C..F'
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
172
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
173 frame = frame[frame['AA JUNCTION'].str.startswith('C') &
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
174 frame['AA JUNCTION'].str.endswith('F')]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
175
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
176 head.append('CDR3 landmarks C-F')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
177 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
178 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
179 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
180 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
181 elif cells == 'BCR':
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
182 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
183 filtered = frame[~frame['AA JUNCTION'].str.startswith('C') |
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
184 ~frame['AA JUNCTION'].str.endswith('W')]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
185
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
186 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
187 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
188 filtall.loc[filtered.index,'Reason'] = 'Not C..W'
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
189
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
190 frame = frame[frame['AA JUNCTION'].str.startswith('C') &
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
191 frame['AA JUNCTION'].str.endswith('W')]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
192
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
193 head.append('CDR3 landmarks C-W')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
194 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
195 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
196 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
197 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
198 else:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
199 print('TCR or BCR type')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
200
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
201
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
202 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
203
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
204 filtered = frame[frame['V-REGION identity %'] < Vper]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
205
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
206
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
207 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
208 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
209 filtall.loc[filtered.index,'Reason'] = 'identity < {iden}%'.format(iden = Vper)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
210
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
211 frame=frame[frame['V-REGION identity %']>= Vper]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
212 head.append('Identity >= {iden}%'.format(iden = Vper))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
213 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
214 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
215 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
216
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
217 head.append('Total filter out A')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
218 head.append('Total filter in A')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
219 seqlen.append(len(filtall))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
220 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
221
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
222 ###############################
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
223 if Vgene != 'null':
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
224
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
225 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
226
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
227 filtered = frame[frame['V-GENE and allele'] != Vgene]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
228
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
229 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
230 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
231 filtall.loc[filtered.index,'Reason'] = 'V-GENE != {} '.format(Vgene)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
232
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
233
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
234 frame = frame[frame['V-GENE and allele'] == Vgene]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
235
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
236
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
237
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
238 head.append('V-GENE = {} '.format(Vgene))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
239 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
240 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
241 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
242
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
243
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
244
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
245 ###############################
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
246 if (laa1 != 'null') or (laa2 != 'null'):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
247 if int(laa2) == 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
248 low = int(laa1)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
249 high = 100
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
250 elif int(laa1) > int(laa2):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
251 low = int(laa2)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
252 high = int(laa1)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
253 else:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
254 low = int(laa1)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
255 high = int(laa2)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
256
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
257 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
258 criteria = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) < low)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
259 criteria2 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) > high)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
260 filtered = frame[criteria | criteria2]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
261
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
262 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
263 if int(laa2)==0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
264 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
265 filtall.loc[filtered.index,'Reason'] = 'CDR3 length not bigger than {}'.format(low)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
266 else:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
267 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
268 filtall.loc[filtered.index,'Reason'] = 'CDR3 length not from {} to {}'.format(low,high)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
269
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
270 criteria3 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) >= low)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
271 criteria4 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) <= high)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
272 frame = frame[criteria3 & criteria4]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
273
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
274 if int(laa2)==0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
275 head.append('CDR3 length bigger than {}'.format(low))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
276 else:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
277 head.append('CDR3 length from {} to {} '.format(low,high))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
278 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
279 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
280 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
281
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
282 ###############################
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
283 if conaa != 'null':
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
284 if conaa.islower():
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
285 conaa = conaa.upper()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
286 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
287
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
288 filtered = frame[~frame['AA JUNCTION'].str.contains(conaa)]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
289
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
290 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
291 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
292 filtall.loc[filtered.index,'Reason'] = 'CDR3 not containing {}'.format(conaa)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
293
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
294 frame = frame[frame['AA JUNCTION'].str.contains(conaa) ]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
295
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
296 head.append('CDR3 containing {}'.format(conaa))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
297 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
298 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
299 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
300
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
301
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
302
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
303
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
304 #####------------keep the small J gene name--------------------
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
305 #frame['J-GENE and allele'] = frame['J-GENE and allele'].map(filter_condition_Jgene)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
306 cc2=np.array(frame['J-GENE and allele'].unique())
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
307
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
308 for x in cc2:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
309 try:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
310 if notnull(x):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
311 x1=x.split('*')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
312 # print(x)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
313 # print (x1[0])
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
314 trbj=x1[0].split(' ')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
315 frame['J-GENE and allele']=frame['J-GENE and allele'].replace(x,trbj[1])
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
316 except IndexError as e:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
317 print('J-Gene has been formed')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
318
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
319
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
320
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
321 x=None
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
322 x1=None
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
323
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
324
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
325 #------------keep the small D gene name--------------------
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
326 cc1=np.array(frame['D-GENE and allele'].unique())
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
327 for x in cc1:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
328 try:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
329 if notnull(x):
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
330 x1=x.split('*')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
331 trbd=x1[0].split(' ')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
332 frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,trbd[1])
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
333 else:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
334 frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,'none')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
335 except IndexError as e:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
336 print('D-gene has been formed')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
337
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
338
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
339 x=None
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
340 x1=None
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
341
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
342
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
343 if Jgene != 'null':
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
344
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
345 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
346
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
347 filtered = frame[frame['J-GENE and allele'] != Jgene]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
348
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
349 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
350 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
351 filtall.loc[filtered.index,'Reason'] = 'J-GENE not {} '.format(Jgene)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
352
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
353
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
354 frame = frame[frame['J-GENE and allele'] == Jgene]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
355
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
356
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
357
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
358 head.append('J-GENE = {} '.format(Jgene))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
359 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
360 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
361 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
362
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
363
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
364
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
365 if Dgene != 'null':
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
366
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
367 filtered = DataFrame()
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
368
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
369 filtered = frame[frame['D-GENE and allele'] != Dgene]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
370
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
371 filtall = filtall.append(filtered)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
372 if len(filtall) > 0:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
373 filtall.loc[filtered.index,'Reason'] = 'D-GENE not {} '.format(Dgene)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
374
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
375
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
376 frame = frame[frame['D-GENE and allele'] == Dgene]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
377
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
378
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
379
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
380 head.append('D-GENE = {} '.format(Dgene))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
381 head.append('filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
382 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
383 seqlen.append(len(filtered))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
384
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
385
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
386 head.append('Total filter out')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
387 head.append('Total filter in')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
388 seqlen.append(len(filtall))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
389 seqlen.append(len(frame))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
390 summ_df = DataFrame(index = head)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
391 col = fname
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
392
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
393 summ_df[col] = seqlen
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
394 frame=frame.rename(columns = {'V-GENE and allele':'V-GENE',
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
395 'J-GENE and allele':'J-GENE','D-GENE and allele':'D-GENE'})
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
396
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
397
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
398 frcol.append('Reason')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
399
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
400 filtall = filtall[frcol]
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
401
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
402 #--------------out CSV---------------------------
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
403 frame.index = range(1,len(frame)+1)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
404 if not summ_df.empty:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
405 summ_df['%'] = (100*summ_df[summ_df.columns[0]]/summ_df[summ_df.columns[0]][summ_df.index[0]]).map(('{:.4f}'.format))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
406 return(frame,filtall,summ_df)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
407 except KeyError as e:
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
408 print('This file has no ' + str(e) + ' column')
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
409 return(frame,filtall,summ_df)
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
410
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
411
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: read the positional command-line arguments, run
    # dataFiltering() on them, write each non-empty result table to its
    # tab-separated output file and report the elapsed wall-clock time.

    start = time.time()

    # Positional arguments. Each one is fetched by index (not by slicing) so
    # that a missing argument still raises IndexError.
    cli = sys.argv
    inp, cells, psorf, con, prod, CF = [cli[pos] for pos in range(1, 7)]
    Vper = float(cli[7])  # the only argument converted to a number here
    (Vgene, laa1, conaa, filterin, filterout, Sum_table,
     Jgene, Dgene, laa2, fname) = [cli[pos] for pos in range(8, 18)]

    # Run the filtering; yields the kept rows, the rejected rows and the
    # summary table, in that order.
    fin, fout, summ = dataFiltering(
        inp, cells, psorf, con, prod, CF, Vper, Vgene,
        laa1, laa2, conaa, Jgene, Dgene, fname,
    )

    # Write every non-empty table to its destination (summary first, then
    # filtered-in, then filtered-out); an empty table produces no file.
    for table, destination in ((summ, Sum_table),
                               (fin, filterin),
                               (fout, filterout)):
        if table.empty:
            continue
        table.to_csv(destination, sep='\t')

    # Report execution time in seconds.
    stop = time.time()
    print('Runtime:' + str(stop - start))
0e37e5b73273 Initial commit
chmaramis
parents:
diff changeset
449