# HG changeset patch
# User chmaramis
# Date 1521366860 14400
# Node ID 8be019b173e6b498d7113468a89be78d17bdc54b
Uploaded included tools
diff -r 000000000000 -r 8be019b173e6 cmpb2016/appending.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/appending.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov 05 14:17:24 2014
+
+@author: chmaramis
+"""
+
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+def appending(arg):
+
+ appfr = DataFrame()
+ for path in arg:
+ frame = DataFrame()
+ tp = read_csv(path, iterator=True, chunksize=1000,sep='\t', index_col=0 )
+ frame = concat([chunk for chunk in tp])
+ appfr = appfr.append(frame)
+ appfr = appfr[frame.columns]
+ appfr.index = range(1,len(appfr)+1)
+ return appfr
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ arg=sys.argv[2:]
+ lastEl = sys.argv[1]
+
+ # Execute basic function
+ appfr = appending(arg)
+
+ # Save output to CSV files
+ if not appfr.empty:
+ appfr.to_csv(lastEl, sep= '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/appending.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/appending.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,21 @@
+
+Concatenate IMGT Report Files
+
+appending.py "${output1}"
+#for x in $imgt_files
+ "$x.igfile"
+#end for
+
+
+
+
+
+
+
+
+
+
+
+This tool concatenates two or more IMGT Report Files of the same type.
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_CDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/comp_clono_CDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jun 19 17:33:34 2014
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+import functools as ft
+import sys
+import time
+
+frm = lambda x,y: '{r}/{l}'.format(r=x,l=y)
+
+
+def cdr3Computation(inp_name, out1, t10n, fname):
+
+ frame = DataFrame()
+ tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ frame = concat([chunk for chunk in tp])
+
+ grouped = frame.groupby(['AA JUNCTION'])
+ x=grouped.size()
+ x1=DataFrame(x.index, columns=['AA JUNCTION'])
+ x1['Reads']=x.values
+ total = sum(x1['Reads'])
+ #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']]
+ x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total))
+ x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format)
+
+ final = x1.sort_values(by = ['Reads'] , ascending = False)
+
+ final.index=range(1,len(final)+1)
+ final.to_csv(out1 , sep = '\t')
+
+ numofclono = len(final)
+ clust = len(final[final['Reads'] > 1])
+ sing = len (final[final['Reads'] == 1])
+ top10 = final[['AA JUNCTION','Frequency %']].head(10)
+ top10.to_csv(t10n , sep = '\t')
+
+ summary = [[str(top10['AA JUNCTION'].values[0])]]
+ summary.append([top10['Frequency %'].values[0]])
+ summary.append([numofclono])
+ summary.append([clust,'{:.4f}'.format(100*clust/numofclono)])
+ summary.append([sing,'{:.4f}'.format(100*sing/numofclono)])
+
+ ind = ['Dominant Clonotype (CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes','Singletons']
+ spl = fname.split('_')
+ col = [spl[0],'%']
+
+ frsum = DataFrame(summary,index = ind, columns = col)
+
+ return frsum
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ inp_name = sys.argv[1]
+ out1 = sys.argv[2]
+ t10n = sys.argv[3]
+ sname = sys.argv[4]
+ fname = sys.argv[5]
+
+ # Execute basic function
+ frsum = cdr3Computation(inp_name,out1,t10n,fname)
+
+ # Save output to CSV files
+ if not frsum.empty:
+ frsum.to_csv(sname, sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_CDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/comp_clono_CDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,19 @@
+
+ Compute CDR3 clonotypes
+ comp_clono_CDR3.py $input $clonos $topcl $summ ${input.name}
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the CDR3 clonotypes.
+
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_JCDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/comp_clono_JCDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jun 19 17:33:34 2014
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+import functools as ft
+import sys
+import time
+
+frm = lambda x,y: '{r}/{l}'.format(r=x,l=y)
+
+def clonotypeComputationJ(inp_name,out1,t10n,fname):
+
+ frame = DataFrame()
+ tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ frame = concat([chunk for chunk in tp])
+
+ grouped = frame.groupby(['J-GENE','AA JUNCTION'])
+ x=grouped.size()
+ x1=DataFrame(list(x.index), columns=['J-GENE','AA JUNCTION'])
+ x1['Reads']=x.values
+ total = sum(x1['Reads'])
+ #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']]
+ x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total))
+ x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format)
+
+ final = x1.sort_values(by = ['Reads'] , ascending = False)
+
+ final.index=range(1,len(final)+1)
+ final.to_csv(out1 , sep = '\t')
+
+ numofclono = len(final)
+ clust = len(final[final['Reads'] > 1])
+ sing = len (final[final['Reads'] == 1])
+ top10 = final[['J-GENE','AA JUNCTION','Frequency %']].head(10)
+ top10.to_csv(t10n , sep = '\t')
+
+ summary = [[str(top10['J-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]]
+ summary.append([top10['Frequency %'].values[0]])
+ summary.append([numofclono])
+ summary.append([clust,'{:.4f}'.format(100*clust/numofclono)])
+ summary.append([sing,'{:.4f}'.format(100*sing/numofclono)])
+
+
+ ind = ['Dominant Clonotype (J+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons']
+ spl = fname.split('_')
+ col = [spl[0],'%']
+
+ frsum = DataFrame(summary,index = ind, columns = col)
+
+ return frsum
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ inp_name = sys.argv[1]
+ out1 = sys.argv[2]
+ t10n = sys.argv[3]
+ sname = sys.argv[4]
+ fname = sys.argv[5]
+
+ # Execute basic function
+ frsum = clonotypeComputationJ(inp_name,out1,t10n,fname)
+
+ # Save output to CSV files
+ if not frsum.empty:
+ frsum.to_csv(sname, sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_JCDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/comp_clono_JCDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,24 @@
+
+ Compute J+CDR3 clonotypes
+ comp_clono_JCDR3.py $input $clonos $topcl $summ2 ${input.name}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the (J-gene, CDR3) clonotypes and their frequencies.
+
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VCDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/comp_clono_VCDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jun 19 17:33:34 2014
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+import functools as ft
+import sys
+import time
+
+frm = lambda x,y: '{r}/{l}'.format(r=x,l=y)
+
+def clonotypeComputation(inp_name, out1, t10n, fname):
+
+ frame = DataFrame()
+ tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ frame = concat([chunk for chunk in tp])
+
+
+ grouped = frame.groupby(['V-GENE','AA JUNCTION'])
+ x=grouped.size()
+ x1=DataFrame(list(x.index), columns=['V-GENE','AA JUNCTION'])
+ x1['Reads']=x.values
+ total = sum(x1['Reads'])
+ #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']]
+ x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total))
+ x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format)
+
+ final = x1.sort_values(by = ['Reads'] , ascending = False)
+
+ final.index=range(1,len(final)+1)
+ final.to_csv(out1 , sep = '\t')
+
+ numofclono = len(final)
+ clust = len(final[final['Reads'] > 1])
+ sing = len (final[final['Reads'] == 1])
+ top10 = final[['V-GENE','AA JUNCTION','Frequency %']].head(10)
+ top10.to_csv(t10n , sep = '\t')
+
+ summary = [[str(top10['V-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]]
+ summary.append([top10['Frequency %'].values[0]])
+ summary.append([numofclono])
+ summary.append([clust,'{:.4f}'.format(100*clust/numofclono)])
+ summary.append([sing,'{:.4f}'.format(100*sing/numofclono)])
+
+ ind = ['Dominant Clonotype (V+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons']
+ spl = fname.split('_')
+ col = [spl[0],'%']
+
+ frsum = DataFrame(summary,index = ind, columns = col)
+
+ return frsum
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ inp_name = sys.argv[1]
+ out1 = sys.argv[2]
+ t10n = sys.argv[3]
+ sname = sys.argv[4]
+ fname = sys.argv[5]
+
+ # Execute basic function
+ frsum = clonotypeComputation(inp_name,out1,t10n,fname)
+
+ # Save output to CSV files
+ if not frsum.empty:
+ frsum.to_csv(sname, sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VCDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/comp_clono_VCDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,24 @@
+
+ Compute V+CDR3 clonotypes
+ comp_clono_VCDR3.py $input $clonos $topcl $summ2 ${input.name}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the (V-gene, CDR3) clonotypes and their frequencies.
+
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VDJCDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/comp_clono_VDJCDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Dec 3 14:54:00 2015
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+import functools as ft
+import sys
+import time
+
+frm = lambda x,y: '{r}/{l}'.format(r=x,l=y)
+
+def clonotypeComputationVDJ(inp_name,out1,t10n,fname):
+
+ frame = DataFrame()
+ tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ frame = concat([chunk for chunk in tp])
+
+ grouped = frame.groupby(['V-GENE','D-GENE','J-GENE','AA JUNCTION'])
+ x=grouped.size()
+ x1=DataFrame(list(x.index), columns=['V-GENE','D-GENE','J-GENE','AA JUNCTION'])
+ x1['Reads']=x.values
+ total = sum(x1['Reads'])
+ #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']]
+ x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total))
+ x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format)
+
+ final = x1.sort_values(by = ['Reads'] , ascending = False)
+ #final = x1.sort_values(by = ['Reads'] , ascending = False)
+
+ final.index=range(1,len(final)+1)
+ final.to_csv(out1 , sep = '\t')
+
+ numofclono = len(final)
+ clust = len(final[final['Reads'] > 1])
+ sing = len (final[final['Reads'] == 1])
+ top10 = final[['V-GENE','D-GENE','J-GENE','AA JUNCTION','Frequency %']].head(10)
+ top10.to_csv(t10n , sep = '\t')
+
+ summary = [[str(top10['V-GENE'].values[0]+','+top10['D-GENE'].values[0]+','+top10['J-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]]
+ summary.append([top10['Frequency %'].values[0]])
+ summary.append([numofclono])
+ summary.append([clust,'{:.4f}'.format(100*clust/numofclono)])
+ summary.append([sing,'{:.4f}'.format(100*sing/numofclono)])
+
+
+ ind = ['Dominant Clonotype (V+D+J+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons']
+ spl = fname.split('_')
+ col = [spl[0],'%']
+
+ frsum = DataFrame(summary,index = ind, columns = col)
+
+ return frsum
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ inp_name = sys.argv[1]
+ out1 = sys.argv[2]
+ t10n = sys.argv[3]
+ sname = sys.argv[4]
+ fname = sys.argv[5]
+
+ # Execute basic function
+ frsum = clonotypeComputationVDJ(inp_name,out1,t10n,fname)
+
+ # Save output to CSV files
+ if not frsum.empty:
+ frsum.to_csv(sname, sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VDJCDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/comp_clono_VDJCDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,24 @@
+
+ Compute V+D+J+CDR3 clonotypes
+ comp_clono_VDJCDR3.py $input $clonos $topcl $summ2 ${input.name}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the (V-gene, D-gene, J-gene, CDR3) clonotypes and their frequencies.
+
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VJCDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/comp_clono_VJCDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct 23 17:33:34 2014
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+import functools as ft
+import sys
+import time
+
+frm = lambda x,y: '{r}/{l}'.format(r=x,l=y)
+
+def clonotypeComputationVJ(inp_name,out1,t10n,fname):
+
+ frame = DataFrame()
+ tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ frame = concat([chunk for chunk in tp])
+
+ grouped = frame.groupby(['V-GENE','J-GENE','AA JUNCTION'])
+ x=grouped.size()
+ x1=DataFrame(list(x.index), columns=['V-GENE','J-GENE','AA JUNCTION'])
+ x1['Reads']=x.values
+ total = sum(x1['Reads'])
+ #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']]
+ x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total))
+ x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format)
+
+ final = x1.sort_values(by = ['Reads'] , ascending = False)
+ #final = x1.sort_values(by = ['Reads'] , ascending = False)
+
+ final.index= range(1,len(final)+1)
+ final.to_csv(out1 , sep = '\t')
+
+ numofclono = len(final)
+ clust = len(final[final['Reads'] > 1])
+ sing = len (final[final['Reads'] == 1])
+ top10 = final[['V-GENE','J-GENE','AA JUNCTION','Frequency %']].head(10)
+ top10.to_csv(t10n , sep = '\t')
+
+ summary = [[str(top10['V-GENE'].values[0]+','+top10['J-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]]
+ summary.append([top10['Frequency %'].values[0]])
+ summary.append([numofclono])
+ summary.append([clust,'{:.4f}'.format(100*clust/numofclono)])
+ summary.append([sing,'{:.4f}'.format(100*sing/numofclono)])
+
+
+ ind = ['Dominant Clonotype (V+J+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons']
+ spl = fname.split('_')
+ col = [spl[0],'%']
+
+ frsum = DataFrame(summary,index = ind, columns = col)
+
+ return frsum
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ inp_name = sys.argv[1]
+ out1 = sys.argv[2]
+ t10n = sys.argv[3]
+ sname = sys.argv[4]
+ fname = sys.argv[5]
+
+ # Execute basic function
+ frsum = clonotypeComputationVJ(inp_name,out1,t10n,fname)
+
+ # Save output to CSV files
+ if not frsum.empty:
+ frsum.to_csv(sname, sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VJCDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/comp_clono_VJCDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,24 @@
+
+ Compute V+J+CDR3 clonotypes
+ comp_clono_VJCDR3.py $input $clonos $topcl $summ2 ${input.name}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the (V-gene, J-gene, CDR3) clonotypes and their frequencies.
+
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/compare_repertoire_J.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/compare_repertoire_J.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Feb 29 10:18:39 2016
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+sw_reads = lambda x: x.startswith('Reads')
+sw_freq = lambda x: x.startswith('Freq')
+sw_gene = lambda x: x.startswith('J')
+
+def freqtoall(inputs):
+
+ mer=DataFrame()
+
+ for x in range(0,len(inputs),2):
+
+ ini = read_csv(inputs[x] , sep = '\t' , index_col = 0)
+
+ ini.drop(ini.columns[np.where(ini.columns.map(sw_reads))[0]], axis=1, inplace=True)
+
+ x1 = inputs[x+1].split('_')
+ ini.rename(columns={ini.columns[np.where(ini.columns.map(sw_freq))[0][0]]: x1[0]}, inplace=True)
+
+ if mer.empty:
+ mer = DataFrame(ini)
+ else:
+ mer = merge(mer,ini, on=ini.columns[np.where(ini.columns.map(sw_gene))[0][0]] , how='outer')
+
+ mer=mer.fillna(0)
+ mer['mean'] = mer.sum(axis=1)/(len(mer.columns)-1)
+ fr = 'mean'
+
+ mer=mer.sort_values(by = fr,ascending=False)
+ mer[fr] = mer[fr].map('{:.4f}'.format)
+ mer.index = range(1,len(mer)+1)
+
+ return mer
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ inputs = sys.argv[2:]
+ output = sys.argv[1]
+
+ # Execute basic function
+ mer = freqtoall(inputs)
+
+ # Save output to CSV files
+ if not mer.empty:
+ mer.to_csv(output , sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/compare_repertoire_J.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/compare_repertoire_J.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,21 @@
+
+Compare J-gene repertoires
+
+compare_repertoire_J.py "${output1}"
+#for x in $rep_files
+ "$x.rpfile"
+ "$x.rpfile.name"
+#end for
+
+
+
+
+
+
+
+
+
+
+This tool produces a union of all patients' J-gene repertoires and computes the mean frequency of each J-gene.
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/compare_repertoire_V.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/compare_repertoire_V.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Sep 16 12:50:43 2014
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+sw_reads = lambda x: x.startswith('Reads')
+sw_freq = lambda x: x.startswith('Freq')
+sw_gene = lambda x: x.startswith('V')
+
+def freqtoall(inputs):
+
+ mer=DataFrame()
+
+ for x in range(0,len(inputs),2):
+
+ ini = read_csv(inputs[x] , sep = '\t' , index_col = 0)
+
+ ini.drop(ini.columns[np.where(ini.columns.map(sw_reads))[0]], axis=1, inplace=True)
+
+ x1 = inputs[x+1].split('_')
+ ini.rename(columns={ini.columns[np.where(ini.columns.map(sw_freq))[0][0]]: x1[0]}, inplace=True)
+
+ if mer.empty:
+ mer = DataFrame(ini)
+ else:
+ mer = merge(mer,ini, on=ini.columns[np.where(ini.columns.map(sw_gene))[0][0]] , how='outer')
+
+ mer=mer.fillna(0)
+ mer['mean'] = mer.sum(axis=1)/(len(mer.columns)-1)
+ fr = 'mean'
+
+ mer=mer.sort_values(by = fr,ascending=False)
+ mer[fr] = mer[fr].map('{:.4f}'.format)
+ mer.index = range(1,len(mer)+1)
+
+ return mer
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ inputs = sys.argv[2:]
+ output = sys.argv[1]
+
+ # Execute basic function
+ mer = freqtoall(inputs)
+
+ # Save output to CSV files
+ if not mer.empty:
+ mer.to_csv(output , sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/compare_repertoire_V.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/compare_repertoire_V.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,21 @@
+
+Compare V-gene repertoires
+
+compare_repertoire_V.py "${output1}"
+#for x in $rep_files
+ "$x.rpfile"
+ "$x.rpfile.name"
+#end for
+
+
+
+
+
+
+
+
+
+
+This tool produces a union of all patients' V-gene repertoires and computes the mean frequency of each V-gene.
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_CDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/exclus_clono_CDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Feb 29 11:12:09 2016
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+
+
+def exclusiveCDR3Func(inputs,thres):
+
+ cdr3=DataFrame()
+
+ # File A
+ cl = DataFrame()
+ cl = read_csv(inputs[0] , sep = '\t' , index_col = 0)
+ if (thres != 'null'):
+ cl = cl[cl['Reads'] > int(thres)]
+ cdr3 = cl
+
+ # File B
+ cl = DataFrame()
+ cl = read_csv(inputs[2] , sep = '\t' , index_col = 0)
+ if (thres != 'null'):
+ cl = cl[cl['Reads'] > int(thres)]
+ cl.rename(columns={'Reads':'ReadsB'}, inplace=True)
+ cdr3 = cdr3.merge(cl[['AA JUNCTION','ReadsB']], how='left', on='AA JUNCTION')
+
+ cdr3['ReadsB'].fillna(0, inplace=True)
+
+ cdr3 = cdr3[cdr3['ReadsB'] == 0]
+ del cdr3['ReadsB']
+
+ cdr3.index = range(1,len(cdr3)+1)
+
+ return cdr3
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ threshold = sys.argv[2]
+ arg = sys.argv[3:]
+ output = sys.argv[1]
+
+ # Execute basic function
+ excl = exclusiveCDR3Func(arg,threshold)
+
+ # Save output to CSV files
+ if not excl.empty:
+ excl.to_csv(output , sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_CDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/exclus_clono_CDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,33 @@
+
+Compute Exclusive CDR3 Clonotypes
+
+exclus_clono_CDR3.py "$output1" "$Th.thres" "$inputA" "$inputA.name" "$inputB" "$inputB.name"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the exclisive CDR3 clonotypes of patient or group A that are absent from patient or group B.
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_JCDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/exclus_clono_JCDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Feb 29 17:06:09 2016
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+
+def exclusiveJclonoFunc(inputs,thres):
+
+ jClono=DataFrame()
+
+ # File A
+ cl = DataFrame()
+ cl = read_csv(inputs[0] , sep = '\t' , index_col = 0)
+ if (thres != 'null'):
+ cl = cl[cl['Reads'] > int(thres)]
+ jClono = cl
+
+ # File B
+ cl = DataFrame()
+ cl = read_csv(inputs[2] , sep = '\t' , index_col = 0)
+ if (thres != 'null'):
+ cl = cl[cl['Reads'] > int(thres)]
+ cl.rename(columns={'Reads':'ReadsB'}, inplace=True)
+ jClono = jClono.merge(cl[['J-GENE','AA JUNCTION','ReadsB']], how='left', on=['J-GENE','AA JUNCTION'])
+
+ jClono['ReadsB'].fillna(0, inplace=True)
+
+ jClono = jClono[jClono['ReadsB'] == 0]
+ del jClono['ReadsB']
+
+ jClono.index = range(1,len(jClono)+1)
+
+ return jClono
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ threshold = sys.argv[2]
+ arg = sys.argv[3:]
+ output = sys.argv[1]
+
+ # Execute basic function
+ excl = exclusiveJclonoFunc(arg,threshold)
+
+ # Save output to CSV files
+ if not excl.empty:
+ excl.to_csv(output , sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_JCDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/exclus_clono_JCDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,33 @@
+
+Compute Exclusive J+CDR3 Clonotypes
+
+exclus_clono_JCDR3.py "$output1" "$Th.thres" "$inputA" "$inputA.name" "$inputB" "$inputB.name"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the exclisive (J-gene, CDR3) clonotypes of patient or group A that are absent from patient or group B.
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_VCDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/exclus_clono_VCDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Feb 29 16:57:12 2016
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+
+def exclusiveVclonoFunc(inputs,thres):
+
+ vClono=DataFrame()
+
+ # File A
+ cl = DataFrame()
+ cl = read_csv(inputs[0] , sep = '\t' , index_col = 0)
+ if (thres != 'null'):
+ cl = cl[cl['Reads'] > int(thres)]
+ vClono = cl
+
+ # File B
+ cl = DataFrame()
+ cl = read_csv(inputs[2] , sep = '\t' , index_col = 0)
+ if (thres != 'null'):
+ cl = cl[cl['Reads'] > int(thres)]
+ cl.rename(columns={'Reads':'ReadsB'}, inplace=True)
+ vClono = vClono.merge(cl[['V-GENE','AA JUNCTION','ReadsB']], how='left', on=['V-GENE','AA JUNCTION'])
+
+ vClono['ReadsB'].fillna(0, inplace=True)
+
+ vClono = vClono[vClono['ReadsB'] == 0]
+ del vClono['ReadsB']
+
+ vClono.index = range(1,len(vClono)+1)
+
+ return vClono
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ threshold = sys.argv[2]
+ arg = sys.argv[3:]
+ output = sys.argv[1]
+
+ # Execute basic function
+ excl = exclusiveVclonoFunc(arg,threshold)
+
+ # Save output to CSV files
+ if not excl.empty:
+ excl.to_csv(output , sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_VCDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/exclus_clono_VCDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,33 @@
+
+Compute Exclusive V+CDR3 Clonotypes
+
+exclus_clono_VCDR3.py "$output1" "$Th.thres" "$inputA" "$inputA.name" "$inputB" "$inputB.name"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the exclisive (V-gene, CDR3) clonotypes of patient or group A that are absent from patient or group B.
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/ext_repertoire_J.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/ext_repertoire_J.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 20 14:58:08 2014
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+import functools as ft
+import sys
+import time
+
+frm = lambda x,y: '{r}/{l}'.format(r=x,l=y)
+
+def repertoireJgComputation(inp_name, fname):
+
+ df = DataFrame()
+ df = read_csv(inp_name, sep='\t', index_col=0 )
+ #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ #df = concat([chunk for chunk in tp])
+
+ vgroup = df.groupby(['J-GENE'])
+ vdi = vgroup.size()
+ rep = DataFrame(list(vdi.index), columns=['J-GENE'])
+ rep['Reads'] = vdi.values
+ #rep['Reads/Total'] = ['{r}/{l}'.format(r=p , l = len(df)) for p in vdi.values]
+ rep['Reads/Total'] = rep['Reads'].map(ft.partial(frm, y=len(df)))
+ rep['Frequency %'] = (100*rep['Reads']/len(df)).map('{:.4f}'.format)
+
+ rep = rep.sort_values(by = ['Reads'] , ascending = False)
+
+ rep.index = range(1,len(rep)+1)
+
+ su = rep[['J-GENE','Frequency %']].head(10)
+ spl = fname.split('_')
+ summdf = DataFrame([su['J-GENE'].values[0],su['Frequency %'].values[0]],
+ index = ['Dominant J-GENE','Frequency'], columns = [spl[0]])
+ summdf['%'] = ''
+
+ return (rep, summdf)
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ inp_name = sys.argv[1]
+ outrep = sys.argv[2]
+ summ_rep2 = sys.argv[3]
+ fname = sys.argv[4]
+
+ # Execute basic function
+ rep, summdf = repertoireJgComputation(inp_name, fname)
+
+ # Save output to CSV files
+ if not rep.empty:
+ rep.to_csv(outrep, sep = '\t')
+ if not summdf.empty:
+ summdf.to_csv(summ_rep2, sep = '\t')
+
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/ext_repertoire_J.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/ext_repertoire_J.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,24 @@
+
+ Compute repertoire of J-genes
+ ext_repertoire_J.py $input $clonos $summ ${input.name}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the repertoire of J-genes (i.e. , the number of clonotypes using each V-gene over the total number of clonotypes).
+
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/ext_repertoire_V.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/ext_repertoire_V.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 20 14:58:08 2014
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+import functools as ft
+import sys
+import time
+
+frm = lambda x,y: '{r}/{l}'.format(r=x,l=y)
+
+def repertoireComputation(inp_name, fname):
+
+ df = DataFrame()
+ df = read_csv(inp_name, sep='\t', index_col=0 )
+ #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ #df = concat([chunk for chunk in tp])
+
+
+ vgroup = df.groupby(['V-GENE'])
+ vdi = vgroup.size()
+ rep = DataFrame(list(vdi.index), columns=['V-GENE'])
+ rep['Reads'] = vdi.values
+ #rep['Reads/Total'] = ['{r}/{l}'.format(r=p , l = len(df)) for p in vdi.values]
+ rep['Reads/Total'] = rep['Reads'].map(ft.partial(frm, y=len(df)))
+ rep['Frequency %'] = (100*rep['Reads']/len(df)).map('{:.4f}'.format)
+
+ rep = rep.sort_values(by = ['Reads'] , ascending = False)
+ rep.index = range(1,len(rep)+1)
+
+ su = rep[['V-GENE','Frequency %']].head(10)
+ spl = fname.split('_')
+ summdf = DataFrame([su['V-GENE'].values[0],su['Frequency %'].values[0]],
+ index = ['Dominant V-GENE','Frequency'], columns = [spl[0]])
+ summdf['%'] = ''
+
+ return (rep, su, summdf)
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ inp_name = sys.argv[1]
+ outrep = sys.argv[2]
+ summ_rep = sys.argv[3]
+ summ_rep2 = sys.argv[4]
+ fname = sys.argv[5]
+
+ # Execute basic function
+ rep, su, summdf = repertoireComputation(inp_name, fname)
+
+ # Save output to CSV files
+ if not rep.empty:
+ rep.to_csv(outrep, sep = '\t')
+ if not su.empty:
+ su.to_csv(summ_rep, sep = '\t')
+ if not summdf.empty:
+ summdf.to_csv(summ_rep2, sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/ext_repertoire_V.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/ext_repertoire_V.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,24 @@
+
+ Compute repertoire of V-genes
+ ext_repertoire_V.py $input $clonos $topcl $summ ${input.name}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the repertoire of V-genes (i.e. , the number of clonotypes using each V-gene over the total number of clonotypes).
+
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/ngs_filtering.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/ngs_filtering.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,452 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 4 18:41:42 2013
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import string as strpy
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import time
+import sys
+
+
+def filter_condition_AAjunction(x):
+ x= x.strip()
+ if ' ' in x:
+ return x.split(' ')[0]
+ else:
+ return x
+
+#-----------frame creation---------------------
+def filtering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname):
+
+ try:
+ path=inp
+ frame = DataFrame()
+ seqlen = []
+ head = []
+ tp = read_csv(path, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ frame = concat([chunk for chunk in tp])
+
+ frcol = list(frame.columns)
+ #print frcol[-1]
+ if 'Unnamed' in frcol[-1]:
+ del frcol[-1]
+ frame=frame[frcol]
+
+ frame.index = range(1,len(frame)+1)
+
+ head.append('Total reads of raw data')
+ seqlen.append(len(frame))
+
+ #------------drop nulls--------------------
+ filtered = DataFrame()
+ filtall = DataFrame()
+ summ_df = DataFrame()
+ filtered = frame[isnull(frame['AA JUNCTION']) | isnull(frame['V-GENE and allele'])]
+
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = "NoResults"
+ frame = frame[frame['AA JUNCTION'].notnull()]
+ frame = frame[frame['V-GENE and allele'].notnull()]
+
+ head.append('Not Null CDR3/V')
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+ filtered = DataFrame()
+
+ if psorf.startswith('y') or psorf.startswith('Y'):
+
+ cc0=np.array(frame['V-GENE and allele'].unique())
+
+
+ for x in cc0:
+ x1=x.split('*')
+ try:
+ if (x1[1].find('P')>-1) or (x1[1].find('ORF')>-1):
+ filtered = filtered.append(frame[frame['V-GENE and allele'] == x])
+ frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,NA)
+ elif x.find('or')>-1:
+ posa=x.count('or')
+ x2=x.split('or')
+ x4=''
+ genelist=[]
+ for cnt in range(0, posa+1):
+ x3=x2[cnt].split('*')
+ x3[0]=x3[0].strip()#kobei ta space
+ k=x3[0].split(' ')# holds only TRBV
+ if cnt==0:
+ genelist.append(k[1])
+ x4+=k[1]
+ elif ((str(k[1]) in genelist) == False) & (x3[1].find('P')==-1):# check for P in x3
+ genelist.append(k[1])
+ x4+=' or '
+ x4+=k[1]
+ x3=None
+ k1=None
+ genelist=None
+
+ frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,x4)
+
+ else:
+ s=x1[0].split(' ')
+ frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,s[1])
+ except IndexError as e:
+ print('V-gene is already been formed')
+ continue
+
+ x=None
+ x1=None
+ s=None
+
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'P or ORF'
+ frame = frame[frame['V-GENE and allele'].notnull()]
+
+ head.append('Functional TRBV')
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+ filtered = DataFrame()
+
+
+
+ #------------FILTERING for data quality--------------------
+ if con.startswith('y') or con.startswith('Y'):
+ filtered = frame [frame['AA JUNCTION'].str.contains('X') |
+ frame['AA JUNCTION'].str.contains('#') |
+ frame['AA JUNCTION'].str.contains('[*]')]
+
+
+
+ frame = frame [~frame['AA JUNCTION'].str.contains('X') &
+ ~frame['AA JUNCTION'].str.contains('#') &
+ ~frame['AA JUNCTION'].str.contains('[*]') ]
+
+
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'X,#,*'
+ head.append('Not Containing X,#,*')
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+ filtered = DataFrame()
+
+ # Set label of functionality column, taking into account current & past IMGT Summary column label
+ functionality_label = 'Functionality'
+ if 'V-DOMAIN Functionality' in frame.columns:
+ functionality_label = 'V-DOMAIN Functionality'
+
+ if prod.startswith('y') or prod.startswith('Y'):
+ filtered = frame[~frame[functionality_label].str.startswith('productive')]
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'not productive'
+
+
+ frame=frame[frame[functionality_label].str.startswith('productive')]
+
+ head.append('Productive')
+ head.append('filter out')
+ seqlen.append(len(frame))
+
+ seqlen.append(len(filtered))
+
+
+ frame['AA JUNCTION'] = frame['AA JUNCTION'].map(filter_condition_AAjunction)
+
+ if CF.startswith('y') or CF.startswith('Y'):
+ if cells == 'TCR':
+ filtered = DataFrame()
+ filtered = frame[~frame['AA JUNCTION'].str.startswith('C') |
+ ~frame['AA JUNCTION'].str.endswith('F')]
+
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'Not C..F'
+
+ frame = frame[frame['AA JUNCTION'].str.startswith('C') &
+ frame['AA JUNCTION'].str.endswith('F')]
+
+ head.append('CDR3 landmarks C-F')
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+ filtered = DataFrame()
+ elif cells == 'BCR':
+ filtered = DataFrame()
+ filtered = frame[~frame['AA JUNCTION'].str.startswith('C') |
+ ~frame['AA JUNCTION'].str.endswith('W')]
+
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'Not C..W'
+
+ frame = frame[frame['AA JUNCTION'].str.startswith('C') &
+ frame['AA JUNCTION'].str.endswith('W')]
+
+ head.append('CDR3 landmarks C-W')
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+ filtered = DataFrame()
+ else:
+ print('TCR or BCR type')
+
+
+ filtered = DataFrame()
+
+ filtered = frame[frame['V-REGION identity %'] < Vper]
+
+
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'identity < {iden}%'.format(iden = Vper)
+
+ frame=frame[frame['V-REGION identity %']>= Vper]
+ head.append('Identity >= {iden}%'.format(iden = Vper))
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+
+ head.append('Total filter out A')
+ head.append('Total filter in A')
+ seqlen.append(len(filtall))
+ seqlen.append(len(frame))
+
+ ###############################
+ if Vgene != 'null':
+
+ filtered = DataFrame()
+
+ filtered = frame[frame['V-GENE and allele'] != Vgene]
+
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'V-GENE != {} '.format(Vgene)
+
+
+ frame = frame[frame['V-GENE and allele'] == Vgene]
+
+
+
+ head.append('V-GENE = {} '.format(Vgene))
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+
+
+
+ ###############################
+ if (laa1 != 'null') or (laa2 != 'null'):
+ if int(laa2) == 0:
+ low = int(laa1)
+ high = 100
+ elif int(laa1) > int(laa2):
+ low = int(laa2)
+ high = int(laa1)
+ else:
+ low = int(laa1)
+ high = int(laa2)
+
+ filtered = DataFrame()
+ criteria = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) < low)
+ criteria2 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) > high)
+ filtered = frame[criteria | criteria2]
+
+ filtall = filtall.append(filtered)
+ if int(laa2)==0:
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'CDR3 length not bigger than {}'.format(low)
+ else:
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'CDR3 length not from {} to {}'.format(low,high)
+
+ criteria3 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) >= low)
+ criteria4 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) <= high)
+ frame = frame[criteria3 & criteria4]
+
+ if int(laa2)==0:
+ head.append('CDR3 length bigger than {}'.format(low))
+ else:
+ head.append('CDR3 length from {} to {} '.format(low,high))
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+
+ ###############################
+ if conaa != 'null':
+ if conaa.islower():
+ conaa = conaa.upper()
+ filtered = DataFrame()
+
+ filtered = frame[~frame['AA JUNCTION'].str.contains(conaa)]
+
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'CDR3 not containing {}'.format(conaa)
+
+ frame = frame[frame['AA JUNCTION'].str.contains(conaa) ]
+
+ head.append('CDR3 containing {}'.format(conaa))
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+
+
+
+
+ #####------------keep the small J gene name--------------------
+ #frame['J-GENE and allele'] = frame['J-GENE and allele'].map(filter_condition_Jgene)
+ cc2=np.array(frame['J-GENE and allele'].unique())
+
+ for x in cc2:
+ try:
+ if notnull(x):
+ x1=x.split('*')
+ # print(x)
+ # print (x1[0])
+ trbj=x1[0].split(' ')
+ frame['J-GENE and allele']=frame['J-GENE and allele'].replace(x,trbj[1])
+ except IndexError as e:
+ print('J-Gene has been formed')
+
+
+
+ x=None
+ x1=None
+
+
+ #------------keep the small D gene name--------------------
+ cc1=np.array(frame['D-GENE and allele'].unique())
+ for x in cc1:
+ try:
+ if notnull(x):
+ x1=x.split('*')
+ trbd=x1[0].split(' ')
+ frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,trbd[1])
+ else:
+ frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,'none')
+ except IndexError as e:
+ print('D-gene has been formed')
+
+
+ x=None
+ x1=None
+
+
+ if Jgene != 'null':
+
+ filtered = DataFrame()
+
+ filtered = frame[frame['J-GENE and allele'] != Jgene]
+
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'J-GENE not {} '.format(Jgene)
+
+
+ frame = frame[frame['J-GENE and allele'] == Jgene]
+
+
+
+ head.append('J-GENE = {} '.format(Jgene))
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+
+
+
+ if Dgene != 'null':
+
+ filtered = DataFrame()
+
+ filtered = frame[frame['D-GENE and allele'] != Dgene]
+
+ filtall = filtall.append(filtered)
+ if len(filtall) > 0:
+ filtall.loc[filtered.index,'Reason'] = 'D-GENE not {} '.format(Dgene)
+
+
+ frame = frame[frame['D-GENE and allele'] == Dgene]
+
+
+
+ head.append('D-GENE = {} '.format(Dgene))
+ head.append('filter out')
+ seqlen.append(len(frame))
+ seqlen.append(len(filtered))
+
+
+ head.append('Total filter out')
+ head.append('Total filter in')
+ seqlen.append(len(filtall))
+ seqlen.append(len(frame))
+ summ_df = DataFrame(index = head)
+ col = fname
+
+ summ_df[col] = seqlen
+ frame=frame.rename(columns = {'V-GENE and allele':'V-GENE',
+ 'J-GENE and allele':'J-GENE','D-GENE and allele':'D-GENE'})
+
+
+ frcol.append('Reason')
+
+ filtall = filtall[frcol]
+
+ #--------------out CSV---------------------------
+ frame.index = range(1,len(frame)+1)
+ if not summ_df.empty:
+ summ_df['%'] = (100*summ_df[summ_df.columns[0]]/summ_df[summ_df.columns[0]][summ_df.index[0]]).map(('{:.4f}'.format))
+ return(frame,filtall,summ_df)
+ except KeyError as e:
+ print('This file has no ' + str(e) + ' column')
+ return(frame,filtall,summ_df)
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ inp = sys.argv[1]
+ cells = sys.argv[2]
+ psorf = sys.argv[3]
+ con = sys.argv[4]
+ prod = sys.argv[5]
+ CF = sys.argv[6]
+ Vper = float(sys.argv[7])
+ Vgene = sys.argv[8]
+ laa1 = sys.argv[9]
+ conaa = sys.argv[10]
+ filterin = sys.argv[11]
+ filterout = sys.argv[12]
+ Sum_table = sys.argv[13]
+ Jgene = sys.argv[14]
+ Dgene = sys.argv[15]
+ laa2 = sys.argv[16]
+ fname = sys.argv[17]
+
+ # Execute basic function
+ fin,fout,summ = filtering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname)
+
+ # Save output to CSV files
+ if not summ.empty:
+ summ.to_csv(Sum_table, sep = '\t')
+ if not fin.empty:
+ fin.to_csv(filterin , sep = '\t')
+ if not fout.empty:
+ fout.to_csv(filterout, sep= '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/ngs_filtering.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/ngs_filtering.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,139 @@
+
+ Filter IMGT Summary Data
+ ngs_filtering.py $input $TCR_or_BCR $Vfun $spChar $prod $delCF $threshold $Vg.Vgid $clen.cdr3len1 $cdp.cdr3part $filtin $filtout $summ $Jg.Jgid $Dg.Dgid $clen.cdr3len2 $process_id
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool filters IMGT Summary Data based on a combination of criteria.
+
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_CDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/pub_clono_CDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 21 18:26:01 2015
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+
+
+def publicCDR3Func(inputs,thres):
+
+ cdr3=DataFrame()
+
+ for x in range(0,len(inputs),2):
+ cl = DataFrame()
+ cl = read_csv(inputs[x] , sep = '\t' , index_col = 0)
+ #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ #cl = concat([chunk for chunk in tp])
+
+ if (thres != 'null'):
+ cl = cl[cl['Reads'] > int(thres)]
+
+ x1 = inputs[x+1].split('_')
+
+ del cl['Reads']
+ cl.columns = [cl.columns[0], x1[0]+' '+cl.columns[1], x1[0]+' Relative '+cl.columns[2]]
+
+ if cdr3.empty:
+ cdr3 = cl
+ else:
+ cdr3 = cdr3.merge(cl, how='outer', on='AA JUNCTION')
+
+
+ col = cdr3.columns
+ freqs = col.map(lambda x: 'Frequency' in x)
+ reads = col.map(lambda x: 'Reads/Total' in x)
+
+ cdr3[col[freqs]] = cdr3[col[freqs]].fillna(0)
+ cdr3[col[reads]] = cdr3[col[reads]].fillna('0/*')
+
+ cdr3['Num of Patients']= cdr3[col[freqs]].apply(lambda x: np.sum(x != 0), axis=1)
+
+ cdr3 = cdr3[cdr3['Num of Patients'] > 1]
+
+ cdr3.index = range(1,len(cdr3)+1)
+
+ return cdr3
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ threshold = sys.argv[2]
+ arg = sys.argv[3:]
+ output = sys.argv[1]
+
+ # Execute basic function
+ mer = publicCDR3Func(arg,threshold)
+
+ # Save output to CSV files
+ if not mer.empty:
+ mer.to_csv(output , sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_CDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/pub_clono_CDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,39 @@
+
+Compute Public CDR3 Clonotypes
+
+pub_clono_CDR3.py "$output1" "$Th.thres"
+#for x in $clono_files
+ "$x.clfile"
+ "$x.clfile.name"
+#end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool computes the public CDR3 clonotypes among a number of patients along with their frequencies for each patient.
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_JCDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/pub_clono_JCDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Feb 26 19:08:28 2016
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+
+
+def publicJGeneClonoFunc(inputs,thres):
+
+ clono=DataFrame()
+
+ for x in range(0,len(inputs),2):
+ cl = DataFrame()
+ cl = read_csv(inputs[x] , sep = '\t' , index_col = 0)
+ #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ #cl = concat([chunk for chunk in tp])
+
+ if (thres != 'null'):
+ cl = cl[cl['Reads'] > int(thres)]
+
+ x1 = inputs[x+1].split('_')
+
+ del cl['Reads']
+ cl.columns = [cl.columns[0], cl.columns[1], x1[0]+' '+cl.columns[2], x1[0]+' Relative '+cl.columns[3]]
+
+ if clono.empty:
+ clono = cl
+ else:
+ clono = clono.merge(cl, how='outer', on=['J-GENE','AA JUNCTION'])
+
+
+ col = clono.columns
+ freqs = col.map(lambda x: 'Frequency' in x)
+ reads = col.map(lambda x: 'Reads/Total' in x)
+
+ clono[col[freqs]] = clono[col[freqs]].fillna(0)
+ clono[col[reads]] = clono[col[reads]].fillna('0/*')
+
+ clono['Num of Patients']= clono[col[freqs]].apply(lambda x: np.sum(x != 0), axis=1)
+
+ clono = clono[clono['Num of Patients'] > 1]
+
+ clono.index = range(1,len(clono)+1)
+
+ return clono
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ arg = sys.argv[3:]
+ output = sys.argv[1]
+ thres = sys.argv[2]
+
+
+ # Execute basic function
+ mer = publicJGeneClonoFunc(arg,thres)
+
+ # Save output to CSV files
+ if not mer.empty:
+ mer.to_csv(output , sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_JCDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/pub_clono_JCDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,39 @@
+
+Compute Public J+CDR3 Clonotypes
+
+pub_clono_JCDR3.py "$output1" "$Th.thres"
+#for x in $clono_files
+ "$x.clfile"
+ "$x.clfile.name"
+#end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This tool computes the public (J-gene, CDR3) Clonotypes among a number of patients along with their frequencies for each patient.
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_VCDR3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/pub_clono_VCDR3.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Feb 26 18:49:15 2016
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+
+
+def publicVGeneClonoFunc(inputs,thres):
+
+ clono=DataFrame()
+
+ for x in range(0,len(inputs),2):
+ cl = DataFrame()
+ cl = read_csv(inputs[x] , sep = '\t' , index_col = 0)
+ #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+ #cl = concat([chunk for chunk in tp])
+
+ if (thres != 'null'):
+ cl = cl[cl['Reads'] > int(thres)]
+
+ x1 = inputs[x+1].split('_')
+
+ del cl['Reads']
+ cl.columns = [cl.columns[0], cl.columns[1], x1[0]+' '+cl.columns[2], x1[0]+' Relative '+cl.columns[3]]
+
+ if clono.empty:
+ clono = cl
+ else:
+ clono = clono.merge(cl, how='outer', on=['V-GENE','AA JUNCTION'])
+
+
+ col = clono.columns
+ freqs = col.map(lambda x: 'Frequency' in x)
+ reads = col.map(lambda x: 'Reads/Total' in x)
+
+ clono[col[freqs]] = clono[col[freqs]].fillna(0)
+ clono[col[reads]] = clono[col[reads]].fillna('0/*')
+
+ clono['Num of Patients']= clono[col[freqs]].apply(lambda x: np.sum(x != 0), axis=1)
+
+ clono = clono[clono['Num of Patients'] > 1]
+
+ clono.index = range(1,len(clono)+1)
+
+ return clono
+
+
+if __name__ == '__main__':
+
+ start=time.time()
+
+ # Parse input arguments
+ arg = sys.argv[3:]
+ output = sys.argv[1]
+ thres = sys.argv[2]
+
+
+ # Execute basic function
+ mer = publicVGeneClonoFunc(arg,thres)
+
+ # Save output to CSV files
+ if not mer.empty:
+ mer.to_csv(output , sep = '\t')
+
+ # Print execution time
+ stop=time.time()
+ print('Runtime:' + str(stop-start))
diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_VCDR3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/pub_clono_VCDR3.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,39 @@
+
+Compute Public V+CDR3 Clonotypes
+
+pub_clono_VCDR3.py "$output1" "$Th.thres"
+#for x in $clono_files
+ "$x.clfile"
+ "$x.clfile.name"
+#end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This tool computes the public (V-gene, CDR3) Clonotypes among a number of patients along with their frequencies for each patient.
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/top10_CDR3_exact_pairing.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/top10_CDR3_exact_pairing.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr 18 09:48:00 2016
+
+@author: chmaramis
+"""
+
+import pandas as pd
+import numpy as np
+import sys
+
+
+if __name__ == "__main__":
+
+ clonosFN = sys.argv[1]
+ outFN = sys.argv[2]
+
+ Cl = pd.read_csv(clonosFN,sep='\t',index_col=0)
+ T10 = Cl[:10].copy()
+
+ aa_junction = np.array(T10['AA JUNCTION'])
+ geneCol = [x for x in T10.columns if x.upper().endswith('GENE')][0]
+
+ vG_unique = np.unique(Cl[geneCol])
+ noVG = len(vG_unique)
+
+ F = np.zeros((noVG,20))
+
+ for i in range(0,10):
+ taa = T10['AA JUNCTION'][i+1]
+ sameAA = np.where(Cl['AA JUNCTION'] == taa)[0]+1
+ if Cl[geneCol][sameAA[0]] != T10[geneCol][i+1]:
+ print('We have a problem here!')
+
+ # Make original Gene -1
+ #orGene = Cl['V-GENE'][sameAA[0]]
+ #orGeneUn = np.where(vG_unique == orGene)[0][0]
+ #F[orGeneUn,i] = -1
+
+ # Other Genes
+ for j in range(0,len(sameAA)):
+ othGene = Cl[geneCol][sameAA[j]]
+ othRead = Cl['Reads'][sameAA[j]]
+ othFreq = Cl['Frequency %'][sameAA[j]]
+ orGeneUn = np.where(vG_unique == othGene)[0][0]
+ F[orGeneUn,2*i] += othRead
+ F[orGeneUn,2*i+1] += othFreq
+
+
+ K = list(aa_junction+' Reads')
+ L = list(aa_junction+' Freq. %')
+ columns = [val for pair in zip(K,L) for val in pair]
+
+ D = pd.DataFrame(F,columns=columns,index=vG_unique)
+ D.to_csv(outFN,sep='\t')
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/top10_CDR3_exact_pairing.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/top10_CDR3_exact_pairing.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,17 @@
+
+Pair CDR3 of top10 clonotypes with combined genes
+
+top10_CDR3_exact_pairing.py "$clonos" "$out"
+
+
+
+
+
+
+
+
+
+Lists the genes with which the AA Junctions (CDR3) of the Top-10 clonotypes are combined. Works for both V-Gene and J-Gene clonotypes.
+
+
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/top10_CDR3_inexact_pairing.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/top10_CDR3_inexact_pairing.py Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr 18 11:37:40 2016
+
+@author: chmaramis
+"""
+
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr 18 09:48:00 2016
+
+@author: chmaramis
+"""
+
+import pandas as pd
+import numpy as np
+import sys
+import functools as ft
+
+def maxHam1(s1, s2):
+ if len(s1) != len(s2):
+ return False
+ else:
+ return sum(c1 != c2 for c1, c2 in zip(s1, s2)) <= 1
+
+
+if __name__ == "__main__":
+
+ clonosFN = sys.argv[1]
+ outFN = sys.argv[2]
+
+ Cl = pd.read_csv(clonosFN,sep='\t',index_col=0)
+ T10 = Cl[:10].copy()
+
+ aa_junction = np.array(T10['AA JUNCTION'])
+ geneCol = [x for x in T10.columns if x.upper().endswith('GENE')][0]
+
+ F = np.zeros((2,20))
+
+ for i in range(0,10):
+ taa = T10['AA JUNCTION'][i+1]
+ gene = T10[geneCol][i+1]
+ S1 = Cl['AA JUNCTION'].apply(ft.partial(maxHam1, s2=taa))
+ S2 = Cl[geneCol] == gene
+ S1[i+1] = False
+ F[0,2*i] = (S1 & S2).sum()
+ F[0,2*i+1] = Cl['Frequency %'][S1 & S2].sum()
+ F[1,2*i] = (S1 & ~S2).sum()
+ F[1,2*i+1] = Cl['Frequency %'][S1 & ~S2].sum()
+
+
+ K = list(aa_junction+' Nr. Clonos')
+ L = list(aa_junction+' Freq. %')
+ columns = [val for pair in zip(K,L) for val in pair]
+
+ D = pd.DataFrame(F,columns=columns, index=['same gene', 'different gene'])
+ D.to_csv(outFN,sep='\t')
+
diff -r 000000000000 -r 8be019b173e6 cmpb2016/top10_CDR3_inexact_pairing.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpb2016/top10_CDR3_inexact_pairing.xml Sun Mar 18 05:54:20 2018 -0400
@@ -0,0 +1,17 @@
+
+Pair CDR3 of top10 clonotypes with similar CDR3
+
+top10_CDR3_inexact_pairing.py "$clonos" "$out"
+
+
+
+
+
+
+
+
+
+Provides the number of clonotypes with AA Junction (CDR3) similar to that of the top-10 clonotypes (maximum 1 AA difference). Distinguishes between clonotypes using the same gene and different gene. Works for both V-Gene and J-Gene clonotypes.
+
+
+