# HG changeset patch # User chmaramis # Date 1521366860 14400 # Node ID 8be019b173e6b498d7113468a89be78d17bdc54b Uploaded included tools diff -r 000000000000 -r 8be019b173e6 cmpb2016/appending.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/appending.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Nov 05 14:17:24 2014 + +@author: chmaramis +""" + +import numpy as np +from pandas import * +from numpy import nan as NA +import sys +import time + +def appending(arg): + + appfr = DataFrame() + for path in arg: + frame = DataFrame() + tp = read_csv(path, iterator=True, chunksize=1000,sep='\t', index_col=0 ) + frame = concat([chunk for chunk in tp]) + appfr = appfr.append(frame) + appfr = appfr[frame.columns] + appfr.index = range(1,len(appfr)+1) + return appfr + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + arg=sys.argv[2:] + lastEl = sys.argv[1] + + # Execute basic function + appfr = appending(arg) + + # Save output to CSV files + if not appfr.empty: + appfr.to_csv(lastEl, sep= '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/appending.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/appending.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,21 @@ + +Concatenate IMGT Report Files + +appending.py "${output1}" +#for x in $imgt_files + "$x.igfile" +#end for + + + + + + + + + + + +This tool concatenates two or more IMGT Report Files of the same type. + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_CDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/comp_clono_CDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Jun 19 17:33:34 2014 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +import functools as ft +import sys +import time + +frm = lambda x,y: '{r}/{l}'.format(r=x,l=y) + + +def cdr3Computation(inp_name, out1, t10n, fname): + + frame = DataFrame() + tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + frame = concat([chunk for chunk in tp]) + + grouped = frame.groupby(['AA JUNCTION']) + x=grouped.size() + x1=DataFrame(x.index, columns=['AA JUNCTION']) + x1['Reads']=x.values + total = sum(x1['Reads']) + #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']] + x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total)) + x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format) + + final = x1.sort_values(by = ['Reads'] , ascending = False) + + final.index=range(1,len(final)+1) + final.to_csv(out1 , sep = '\t') + + numofclono = len(final) + clust = len(final[final['Reads'] > 1]) + sing = len (final[final['Reads'] == 1]) + top10 = final[['AA JUNCTION','Frequency %']].head(10) + top10.to_csv(t10n , sep = '\t') + + summary = [[str(top10['AA JUNCTION'].values[0])]] + summary.append([top10['Frequency %'].values[0]]) + summary.append([numofclono]) + summary.append([clust,'{:.4f}'.format(100*clust/numofclono)]) + summary.append([sing,'{:.4f}'.format(100*sing/numofclono)]) + + ind = ['Dominant Clonotype (CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes','Singletons'] + spl = fname.split('_') + col = [spl[0],'%'] + + frsum = DataFrame(summary,index = ind, columns = col) + + return frsum + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + inp_name = sys.argv[1] + out1 = sys.argv[2] + t10n = sys.argv[3] + sname = sys.argv[4] + fname = sys.argv[5] + + # Execute basic function + frsum = cdr3Computation(inp_name,out1,t10n,fname) + + # Save output to CSV files + if not frsum.empty: + frsum.to_csv(sname, sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_CDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/comp_clono_CDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,19 @@ + + Compute CDR3 clonotypes + comp_clono_CDR3.py $input $clonos $topcl $summ ${input.name} + + + + + + + + + + + + +This tool computes the CDR3 clonotypes. + + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_JCDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/comp_clono_JCDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Jun 19 17:33:34 2014 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +import functools as ft +import sys +import time + +frm = lambda x,y: '{r}/{l}'.format(r=x,l=y) + +def clonotypeComputationJ(inp_name,out1,t10n,fname): + + frame = DataFrame() + tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + frame = concat([chunk for chunk in tp]) + + grouped = frame.groupby(['J-GENE','AA JUNCTION']) + x=grouped.size() + x1=DataFrame(list(x.index), columns=['J-GENE','AA JUNCTION']) + x1['Reads']=x.values + total = sum(x1['Reads']) + #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']] + x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total)) + x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format) + + final = x1.sort_values(by = ['Reads'] , ascending = False) + + final.index=range(1,len(final)+1) + final.to_csv(out1 , sep = '\t') + + numofclono = len(final) + clust = len(final[final['Reads'] > 1]) + sing = len (final[final['Reads'] == 1]) + top10 = final[['J-GENE','AA JUNCTION','Frequency %']].head(10) + top10.to_csv(t10n , sep = '\t') + + summary = [[str(top10['J-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]] + summary.append([top10['Frequency %'].values[0]]) + summary.append([numofclono]) + summary.append([clust,'{:.4f}'.format(100*clust/numofclono)]) + summary.append([sing,'{:.4f}'.format(100*sing/numofclono)]) + + + ind = ['Dominant Clonotype (J+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons'] + spl = fname.split('_') + col = [spl[0],'%'] + + frsum = DataFrame(summary,index = ind, columns = col) + + return frsum + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + inp_name = sys.argv[1] + out1 = sys.argv[2] + t10n = sys.argv[3] + sname = sys.argv[4] + fname = sys.argv[5] + + # Execute basic function + frsum = clonotypeComputationJ(inp_name,out1,t10n,fname) + + # Save output to CSV files + if not frsum.empty: + frsum.to_csv(sname, sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_JCDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/comp_clono_JCDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,24 @@ + + Compute J+CDR3 clonotypes + comp_clono_JCDR3.py $input $clonos $topcl $summ2 ${input.name} + + + + + + + + + + + + + + + + + +This tool computes the (J-gene, CDR3) clonotypes and their frequencies. + + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VCDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/comp_clono_VCDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Jun 19 17:33:34 2014 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +import functools as ft +import sys +import time + +frm = lambda x,y: '{r}/{l}'.format(r=x,l=y) + +def clonotypeComputation(inp_name, out1, t10n, fname): + + frame = DataFrame() + tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + frame = concat([chunk for chunk in tp]) + + + grouped = frame.groupby(['V-GENE','AA JUNCTION']) + x=grouped.size() + x1=DataFrame(list(x.index), columns=['V-GENE','AA JUNCTION']) + x1['Reads']=x.values + total = sum(x1['Reads']) + #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']] + x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total)) + x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format) + + final = x1.sort_values(by = ['Reads'] , ascending = False) + + final.index=range(1,len(final)+1) + final.to_csv(out1 , sep = '\t') + + numofclono = len(final) + clust = len(final[final['Reads'] > 1]) + sing = len (final[final['Reads'] == 1]) + top10 = final[['V-GENE','AA JUNCTION','Frequency %']].head(10) + top10.to_csv(t10n , sep = '\t') + + summary = [[str(top10['V-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]] + summary.append([top10['Frequency %'].values[0]]) + summary.append([numofclono]) + summary.append([clust,'{:.4f}'.format(100*clust/numofclono)]) + summary.append([sing,'{:.4f}'.format(100*sing/numofclono)]) + + ind = ['Dominant Clonotype (V+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons'] + spl = fname.split('_') + col = [spl[0],'%'] + + frsum = DataFrame(summary,index = ind, columns = col) + + return frsum + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + inp_name = sys.argv[1] + out1 = sys.argv[2] + t10n = sys.argv[3] + sname = sys.argv[4] + fname = sys.argv[5] + + # Execute basic function + frsum = clonotypeComputation(inp_name,out1,t10n,fname) + + # Save output to CSV files + if not frsum.empty: + frsum.to_csv(sname, sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VCDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/comp_clono_VCDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,24 @@ + + Compute V+CDR3 clonotypes + comp_clono_VCDR3.py $input $clonos $topcl $summ2 ${input.name} + + + + + + + + + + + + + + + + + +This tool computes the (V-gene, CDR3) clonotypes and their frequencies. + + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VDJCDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/comp_clono_VDJCDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Dec 3 14:54:00 2015 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +import functools as ft +import sys +import time + +frm = lambda x,y: '{r}/{l}'.format(r=x,l=y) + +def clonotypeComputationVDJ(inp_name,out1,t10n,fname): + + frame = DataFrame() + tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + frame = concat([chunk for chunk in tp]) + + grouped = frame.groupby(['V-GENE','D-GENE','J-GENE','AA JUNCTION']) + x=grouped.size() + x1=DataFrame(list(x.index), columns=['V-GENE','D-GENE','J-GENE','AA JUNCTION']) + x1['Reads']=x.values + total = sum(x1['Reads']) + #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']] + x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total)) + x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format) + + final = x1.sort_values(by = ['Reads'] , ascending = False) + #final = x1.sort_values(by = ['Reads'] , ascending = False) + + final.index=range(1,len(final)+1) + final.to_csv(out1 , sep = '\t') + + numofclono = len(final) + clust = len(final[final['Reads'] > 1]) + sing = len (final[final['Reads'] == 1]) + top10 = final[['V-GENE','D-GENE','J-GENE','AA JUNCTION','Frequency %']].head(10) + top10.to_csv(t10n , sep = '\t') + + summary = [[str(top10['V-GENE'].values[0]+','+top10['D-GENE'].values[0]+','+top10['J-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]] + summary.append([top10['Frequency %'].values[0]]) + summary.append([numofclono]) + summary.append([clust,'{:.4f}'.format(100*clust/numofclono)]) + summary.append([sing,'{:.4f}'.format(100*sing/numofclono)]) + + + ind = ['Dominant Clonotype (V+D+J+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons'] + spl = fname.split('_') + col = [spl[0],'%'] + + frsum = DataFrame(summary,index = ind, columns = col) + + return frsum + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + inp_name = sys.argv[1] + out1 = sys.argv[2] + t10n = sys.argv[3] + sname = sys.argv[4] + fname = sys.argv[5] + + # Execute basic function + frsum = clonotypeComputationVDJ(inp_name,out1,t10n,fname) + + # Save output to CSV files + if not frsum.empty: + frsum.to_csv(sname, sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VDJCDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/comp_clono_VDJCDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,24 @@ + + Compute V+D+J+CDR3 clonotypes + comp_clono_VDJCDR3.py $input $clonos $topcl $summ2 ${input.name} + + + + + + + + + + + + + + + + + +This tool computes the (V-gene, D-gene, J-gene, CDR3) clonotypes and their frequencies. + + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VJCDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/comp_clono_VJCDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Oct 23 17:33:34 2014 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +import functools as ft +import sys +import time + +frm = lambda x,y: '{r}/{l}'.format(r=x,l=y) + +def clonotypeComputationVJ(inp_name,out1,t10n,fname): + + frame = DataFrame() + tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + frame = concat([chunk for chunk in tp]) + + grouped = frame.groupby(['V-GENE','J-GENE','AA JUNCTION']) + x=grouped.size() + x1=DataFrame(list(x.index), columns=['V-GENE','J-GENE','AA JUNCTION']) + x1['Reads']=x.values + total = sum(x1['Reads']) + #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']] + x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total)) + x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format) + + final = x1.sort_values(by = ['Reads'] , ascending = False) + #final = x1.sort_values(by = ['Reads'] , ascending = False) + + final.index= range(1,len(final)+1) + final.to_csv(out1 , sep = '\t') + + numofclono = len(final) + clust = len(final[final['Reads'] > 1]) + sing = len (final[final['Reads'] == 1]) + top10 = final[['V-GENE','J-GENE','AA JUNCTION','Frequency %']].head(10) + top10.to_csv(t10n , sep = '\t') + + summary = [[str(top10['V-GENE'].values[0]+','+top10['J-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]] + summary.append([top10['Frequency %'].values[0]]) + summary.append([numofclono]) + summary.append([clust,'{:.4f}'.format(100*clust/numofclono)]) + summary.append([sing,'{:.4f}'.format(100*sing/numofclono)]) + + + ind = ['Dominant Clonotype (V+J+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons'] + spl = fname.split('_') + col = [spl[0],'%'] + + frsum = DataFrame(summary,index = ind, columns = col) + + return frsum + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + inp_name = sys.argv[1] + out1 = sys.argv[2] + t10n = sys.argv[3] + sname = sys.argv[4] + fname = sys.argv[5] + + # Execute basic function + frsum = clonotypeComputationVJ(inp_name,out1,t10n,fname) + + # Save output to CSV files + if not frsum.empty: + frsum.to_csv(sname, sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/comp_clono_VJCDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/comp_clono_VJCDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,24 @@ + + Compute V+J+CDR3 clonotypes + comp_clono_VJCDR3.py $input $clonos $topcl $summ2 ${input.name} + + + + + + + + + + + + + + + + + +This tool computes the (V-gene, J-gene, CDR3) clonotypes and their frequencies. + + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/compare_repertoire_J.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/compare_repertoire_J.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 29 10:18:39 2016 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +from numpy import nan as NA +import sys +import time + +sw_reads = lambda x: x.startswith('Reads') +sw_freq = lambda x: x.startswith('Freq') +sw_gene = lambda x: x.startswith('J') + +def freqtoall(inputs): + + mer=DataFrame() + + for x in range(0,len(inputs),2): + + ini = read_csv(inputs[x] , sep = '\t' , index_col = 0) + + ini.drop(ini.columns[np.where(ini.columns.map(sw_reads))[0]], axis=1, inplace=True) + + x1 = inputs[x+1].split('_') + ini.rename(columns={ini.columns[np.where(ini.columns.map(sw_freq))[0][0]]: x1[0]}, inplace=True) + + if mer.empty: + mer = DataFrame(ini) + else: + mer = merge(mer,ini, on=ini.columns[np.where(ini.columns.map(sw_gene))[0][0]] , how='outer') + + mer=mer.fillna(0) + mer['mean'] = mer.sum(axis=1)/(len(mer.columns)-1) + fr = 'mean' + + mer=mer.sort_values(by = fr,ascending=False) + mer[fr] = mer[fr].map('{:.4f}'.format) + mer.index = range(1,len(mer)+1) + + return mer + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + inputs = sys.argv[2:] + output = sys.argv[1] + + # Execute basic function + mer = freqtoall(inputs) + + # Save output to CSV files + if not mer.empty: + mer.to_csv(output , sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/compare_repertoire_J.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/compare_repertoire_J.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,21 @@ + +Compare J-gene repertoires + +compare_repertoire_J.py "${output1}" +#for x in $rep_files + "$x.rpfile" + "$x.rpfile.name" +#end for + + + + + + + + + + +This tool produces a union of all patients' J-gene repertoires and computes the mean frequency of each J-gene. + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/compare_repertoire_V.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/compare_repertoire_V.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Sep 16 12:50:43 2014 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +from numpy import nan as NA +import sys +import time + +sw_reads = lambda x: x.startswith('Reads') +sw_freq = lambda x: x.startswith('Freq') +sw_gene = lambda x: x.startswith('V') + +def freqtoall(inputs): + + mer=DataFrame() + + for x in range(0,len(inputs),2): + + ini = read_csv(inputs[x] , sep = '\t' , index_col = 0) + + ini.drop(ini.columns[np.where(ini.columns.map(sw_reads))[0]], axis=1, inplace=True) + + x1 = inputs[x+1].split('_') + ini.rename(columns={ini.columns[np.where(ini.columns.map(sw_freq))[0][0]]: x1[0]}, inplace=True) + + if mer.empty: + mer = DataFrame(ini) + else: + mer = merge(mer,ini, on=ini.columns[np.where(ini.columns.map(sw_gene))[0][0]] , how='outer') + + mer=mer.fillna(0) + mer['mean'] = mer.sum(axis=1)/(len(mer.columns)-1) + fr = 'mean' + + mer=mer.sort_values(by = fr,ascending=False) + mer[fr] = mer[fr].map('{:.4f}'.format) + mer.index = range(1,len(mer)+1) + + return mer + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + inputs = sys.argv[2:] + output = sys.argv[1] + + # Execute basic function + mer = freqtoall(inputs) + + # Save output to CSV files + if not mer.empty: + mer.to_csv(output , sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/compare_repertoire_V.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/compare_repertoire_V.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,21 @@ + +Compare V-gene repertoires + +compare_repertoire_V.py "${output1}" +#for x in $rep_files + "$x.rpfile" + "$x.rpfile.name" +#end for + + + + + + + + + + +This tool produces a union of all patients' V-gene repertoires and computes the mean frequency of each V-gene. + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_CDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/exclus_clono_CDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 29 11:12:09 2016 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +from numpy import nan as NA +import sys +import time + + + +def exclusiveCDR3Func(inputs,thres): + + cdr3=DataFrame() + + # File A + cl = DataFrame() + cl = read_csv(inputs[0] , sep = '\t' , index_col = 0) + if (thres != 'null'): + cl = cl[cl['Reads'] > int(thres)] + cdr3 = cl + + # File B + cl = DataFrame() + cl = read_csv(inputs[2] , sep = '\t' , index_col = 0) + if (thres != 'null'): + cl = cl[cl['Reads'] > int(thres)] + cl.rename(columns={'Reads':'ReadsB'}, inplace=True) + cdr3 = cdr3.merge(cl[['AA JUNCTION','ReadsB']], how='left', on='AA JUNCTION') + + cdr3['ReadsB'].fillna(0, inplace=True) + + cdr3 = cdr3[cdr3['ReadsB'] == 0] + del cdr3['ReadsB'] + + cdr3.index = range(1,len(cdr3)+1) + + return cdr3 + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + threshold = sys.argv[2] + arg = sys.argv[3:] + output = sys.argv[1] + + # Execute basic function + excl = exclusiveCDR3Func(arg,threshold) + + # Save output to CSV files + if not excl.empty: + excl.to_csv(output , sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_CDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/exclus_clono_CDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,33 @@ + +Compute Exclusive CDR3 Clonotypes + +exclus_clono_CDR3.py "$output1" "$Th.thres" "$inputA" "$inputA.name" "$inputB" "$inputB.name" + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool computes the exclisive CDR3 clonotypes of patient or group A that are absent from patient or group B. + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_JCDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/exclus_clono_JCDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 29 17:06:09 2016 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +from numpy import nan as NA +import sys +import time + + +def exclusiveJclonoFunc(inputs,thres): + + jClono=DataFrame() + + # File A + cl = DataFrame() + cl = read_csv(inputs[0] , sep = '\t' , index_col = 0) + if (thres != 'null'): + cl = cl[cl['Reads'] > int(thres)] + jClono = cl + + # File B + cl = DataFrame() + cl = read_csv(inputs[2] , sep = '\t' , index_col = 0) + if (thres != 'null'): + cl = cl[cl['Reads'] > int(thres)] + cl.rename(columns={'Reads':'ReadsB'}, inplace=True) + jClono = jClono.merge(cl[['J-GENE','AA JUNCTION','ReadsB']], how='left', on=['J-GENE','AA JUNCTION']) + + jClono['ReadsB'].fillna(0, inplace=True) + + jClono = jClono[jClono['ReadsB'] == 0] + del jClono['ReadsB'] + + jClono.index = range(1,len(jClono)+1) + + return jClono + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + threshold = sys.argv[2] + arg = sys.argv[3:] + output = sys.argv[1] + + # Execute basic function + excl = exclusiveJclonoFunc(arg,threshold) + + # Save output to CSV files + if not excl.empty: + excl.to_csv(output , sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_JCDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/exclus_clono_JCDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,33 @@ + +Compute Exclusive J+CDR3 Clonotypes + +exclus_clono_JCDR3.py "$output1" "$Th.thres" "$inputA" "$inputA.name" "$inputB" "$inputB.name" + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool computes the exclisive (J-gene, CDR3) clonotypes of patient or group A that are absent from patient or group B. + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_VCDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/exclus_clono_VCDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 29 16:57:12 2016 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +from numpy import nan as NA +import sys +import time + + +def exclusiveVclonoFunc(inputs,thres): + + vClono=DataFrame() + + # File A + cl = DataFrame() + cl = read_csv(inputs[0] , sep = '\t' , index_col = 0) + if (thres != 'null'): + cl = cl[cl['Reads'] > int(thres)] + vClono = cl + + # File B + cl = DataFrame() + cl = read_csv(inputs[2] , sep = '\t' , index_col = 0) + if (thres != 'null'): + cl = cl[cl['Reads'] > int(thres)] + cl.rename(columns={'Reads':'ReadsB'}, inplace=True) + vClono = vClono.merge(cl[['V-GENE','AA JUNCTION','ReadsB']], how='left', on=['V-GENE','AA JUNCTION']) + + vClono['ReadsB'].fillna(0, inplace=True) + + vClono = vClono[vClono['ReadsB'] == 0] + del vClono['ReadsB'] + + vClono.index = range(1,len(vClono)+1) + + return vClono + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + threshold = sys.argv[2] + arg = sys.argv[3:] + output = sys.argv[1] + + # Execute basic function + excl = exclusiveVclonoFunc(arg,threshold) + + # Save output to CSV files + if not excl.empty: + excl.to_csv(output , sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/exclus_clono_VCDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/exclus_clono_VCDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,33 @@ + +Compute Exclusive V+CDR3 Clonotypes + +exclus_clono_VCDR3.py "$output1" "$Th.thres" "$inputA" "$inputA.name" "$inputB" "$inputB.name" + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool computes the exclisive (V-gene, CDR3) clonotypes of patient or group A that are absent from patient or group B. + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/ext_repertoire_J.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/ext_repertoire_J.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 20 14:58:08 2014 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +import functools as ft +import sys +import time + +frm = lambda x,y: '{r}/{l}'.format(r=x,l=y) + +def repertoireJgComputation(inp_name, fname): + + df = DataFrame() + df = read_csv(inp_name, sep='\t', index_col=0 ) + #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + #df = concat([chunk for chunk in tp]) + + vgroup = df.groupby(['J-GENE']) + vdi = vgroup.size() + rep = DataFrame(list(vdi.index), columns=['J-GENE']) + rep['Reads'] = vdi.values + #rep['Reads/Total'] = ['{r}/{l}'.format(r=p , l = len(df)) for p in vdi.values] + rep['Reads/Total'] = rep['Reads'].map(ft.partial(frm, y=len(df))) + rep['Frequency %'] = (100*rep['Reads']/len(df)).map('{:.4f}'.format) + + rep = rep.sort_values(by = ['Reads'] , ascending = False) + + rep.index = range(1,len(rep)+1) + + su = rep[['J-GENE','Frequency %']].head(10) + spl = fname.split('_') + summdf = DataFrame([su['J-GENE'].values[0],su['Frequency %'].values[0]], + index = ['Dominant J-GENE','Frequency'], columns = [spl[0]]) + summdf['%'] = '' + + return (rep, summdf) + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + inp_name = sys.argv[1] + outrep = sys.argv[2] + summ_rep2 = sys.argv[3] + fname = sys.argv[4] + + # Execute basic function + rep, summdf = repertoireJgComputation(inp_name, fname) + + # Save output to CSV files + if not rep.empty: + rep.to_csv(outrep, sep = '\t') + if not summdf.empty: + summdf.to_csv(summ_rep2, sep = '\t') + + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/ext_repertoire_J.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/ext_repertoire_J.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,24 @@ + + Compute repertoire of J-genes + ext_repertoire_J.py $input $clonos $summ ${input.name} + + + + + + + + + + + + + + + + + +This tool computes the repertoire of J-genes (i.e. , the number of clonotypes using each V-gene over the total number of clonotypes). + + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/ext_repertoire_V.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/ext_repertoire_V.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 20 14:58:08 2014 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +import functools as ft +import sys +import time + +frm = lambda x,y: '{r}/{l}'.format(r=x,l=y) + +def repertoireComputation(inp_name, fname): + + df = DataFrame() + df = read_csv(inp_name, sep='\t', index_col=0 ) + #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + #df = concat([chunk for chunk in tp]) + + + vgroup = df.groupby(['V-GENE']) + vdi = vgroup.size() + rep = DataFrame(list(vdi.index), columns=['V-GENE']) + rep['Reads'] = vdi.values + #rep['Reads/Total'] = ['{r}/{l}'.format(r=p , l = len(df)) for p in vdi.values] + rep['Reads/Total'] = rep['Reads'].map(ft.partial(frm, y=len(df))) + rep['Frequency %'] = (100*rep['Reads']/len(df)).map('{:.4f}'.format) + + rep = rep.sort_values(by = ['Reads'] , ascending = False) + rep.index = range(1,len(rep)+1) + + su = rep[['V-GENE','Frequency %']].head(10) + spl = fname.split('_') + summdf = DataFrame([su['V-GENE'].values[0],su['Frequency %'].values[0]], + index = ['Dominant V-GENE','Frequency'], columns = [spl[0]]) + summdf['%'] = '' + + return (rep, su, summdf) + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + inp_name = sys.argv[1] + outrep = sys.argv[2] + summ_rep = sys.argv[3] + summ_rep2 = sys.argv[4] + fname = sys.argv[5] + + # Execute basic function + rep, su, summdf = repertoireComputation(inp_name, fname) + + # Save output to CSV files + if not rep.empty: + rep.to_csv(outrep, sep = '\t') + if not su.empty: + su.to_csv(summ_rep, sep = '\t') + if not summdf.empty: + summdf.to_csv(summ_rep2, sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/ext_repertoire_V.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/ext_repertoire_V.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,24 @@ + + Compute repertoire of V-genes + ext_repertoire_V.py $input $clonos $topcl $summ ${input.name} + + + + + + + + + + + + + + + + + +This tool computes the repertoire of V-genes (i.e. , the number of clonotypes using each V-gene over the total number of clonotypes). + + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/ngs_filtering.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/ngs_filtering.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,452 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Sep 4 18:41:42 2013 + +@author: chmaramis +""" + +from __future__ import division +import string as strpy +import numpy as np +from pandas import * +from numpy import nan as NA +import time +import sys + + +def filter_condition_AAjunction(x): + x= x.strip() + if ' ' in x: + return x.split(' ')[0] + else: + return x + +#-----------frame creation--------------------- +def filtering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname): + + try: + path=inp + frame = DataFrame() + seqlen = [] + head = [] + tp = read_csv(path, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + frame = concat([chunk for chunk in tp]) + + frcol = list(frame.columns) + #print frcol[-1] + if 'Unnamed' in frcol[-1]: + del frcol[-1] + frame=frame[frcol] + + frame.index = range(1,len(frame)+1) + + head.append('Total reads of raw data') + seqlen.append(len(frame)) + + #------------drop nulls-------------------- + filtered = DataFrame() + filtall = DataFrame() + summ_df = DataFrame() + filtered = frame[isnull(frame['AA JUNCTION']) | isnull(frame['V-GENE and allele'])] + + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = "NoResults" + frame = frame[frame['AA JUNCTION'].notnull()] + frame = frame[frame['V-GENE and allele'].notnull()] + + head.append('Not Null CDR3/V') + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + filtered = DataFrame() + + if psorf.startswith('y') or psorf.startswith('Y'): + + cc0=np.array(frame['V-GENE and allele'].unique()) + + + for x in cc0: + x1=x.split('*') + try: + if (x1[1].find('P')>-1) or (x1[1].find('ORF')>-1): + filtered = filtered.append(frame[frame['V-GENE and allele'] == x]) + frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,NA) + elif x.find('or')>-1: + posa=x.count('or') + x2=x.split('or') + x4='' + genelist=[] + for cnt in range(0, posa+1): + x3=x2[cnt].split('*') + x3[0]=x3[0].strip()#kobei ta space + k=x3[0].split(' ')# holds only TRBV + if cnt==0: + genelist.append(k[1]) + x4+=k[1] + elif ((str(k[1]) in genelist) == False) & (x3[1].find('P')==-1):# check for P in x3 + genelist.append(k[1]) + x4+=' or ' + x4+=k[1] + x3=None + k1=None + genelist=None + + frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,x4) + + else: + s=x1[0].split(' ') + frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,s[1]) + except IndexError as e: + print('V-gene is already been formed') + continue + + x=None + x1=None + s=None + + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'P or ORF' + frame = frame[frame['V-GENE and allele'].notnull()] + + head.append('Functional TRBV') + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + filtered = DataFrame() + + + + #------------FILTERING for data quality-------------------- + if con.startswith('y') or con.startswith('Y'): + filtered = frame [frame['AA JUNCTION'].str.contains('X') | + frame['AA JUNCTION'].str.contains('#') | + frame['AA JUNCTION'].str.contains('[*]')] + + + + frame = frame [~frame['AA JUNCTION'].str.contains('X') & + ~frame['AA JUNCTION'].str.contains('#') & + ~frame['AA JUNCTION'].str.contains('[*]') ] + + + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'X,#,*' + head.append('Not Containing X,#,*') + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + filtered = DataFrame() + + # Set label of functionality column, taking into account current & past IMGT Summary column label + functionality_label = 'Functionality' + if 'V-DOMAIN Functionality' in frame.columns: + functionality_label = 'V-DOMAIN Functionality' + + if prod.startswith('y') or prod.startswith('Y'): + filtered = frame[~frame[functionality_label].str.startswith('productive')] + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'not productive' + + + frame=frame[frame[functionality_label].str.startswith('productive')] + + head.append('Productive') + head.append('filter out') + seqlen.append(len(frame)) + + seqlen.append(len(filtered)) + + + frame['AA JUNCTION'] = frame['AA JUNCTION'].map(filter_condition_AAjunction) + + if CF.startswith('y') or CF.startswith('Y'): + if cells == 'TCR': + filtered = DataFrame() + filtered = frame[~frame['AA JUNCTION'].str.startswith('C') | + ~frame['AA JUNCTION'].str.endswith('F')] + + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'Not C..F' + + frame = frame[frame['AA JUNCTION'].str.startswith('C') & + frame['AA JUNCTION'].str.endswith('F')] + + head.append('CDR3 landmarks C-F') + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + filtered = DataFrame() + elif cells == 'BCR': + filtered = DataFrame() + filtered = frame[~frame['AA JUNCTION'].str.startswith('C') | + ~frame['AA JUNCTION'].str.endswith('W')] + + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'Not C..W' + + frame = frame[frame['AA JUNCTION'].str.startswith('C') & + frame['AA JUNCTION'].str.endswith('W')] + + head.append('CDR3 landmarks C-W') + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + filtered = DataFrame() + else: + print('TCR or BCR type') + + + filtered = DataFrame() + + filtered = frame[frame['V-REGION identity %'] < Vper] + + + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'identity < {iden}%'.format(iden = Vper) + + frame=frame[frame['V-REGION identity %']>= Vper] + head.append('Identity >= {iden}%'.format(iden = Vper)) + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + + head.append('Total filter out A') + head.append('Total filter in A') + seqlen.append(len(filtall)) + seqlen.append(len(frame)) + + ############################### + if Vgene != 'null': + + filtered = DataFrame() + + filtered = frame[frame['V-GENE and allele'] != Vgene] + + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'V-GENE != {} '.format(Vgene) + + + frame = frame[frame['V-GENE and allele'] == Vgene] + + + + head.append('V-GENE = {} '.format(Vgene)) + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + + + + ############################### + if (laa1 != 'null') or (laa2 != 'null'): + if int(laa2) == 0: + low = int(laa1) + high = 100 + elif int(laa1) > int(laa2): + low = int(laa2) + high = int(laa1) + else: + low = int(laa1) + high = int(laa2) + + filtered = DataFrame() + criteria = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) < low) + criteria2 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) > high) + filtered = frame[criteria | criteria2] + + filtall = filtall.append(filtered) + if int(laa2)==0: + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'CDR3 length not bigger than {}'.format(low) + else: + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'CDR3 length not from {} to {}'.format(low,high) + + criteria3 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) >= low) + criteria4 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) <= high) + frame = frame[criteria3 & criteria4] + + if int(laa2)==0: + head.append('CDR3 length bigger than {}'.format(low)) + else: + head.append('CDR3 length from {} to {} '.format(low,high)) + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + + ############################### + if conaa != 'null': + if conaa.islower(): + conaa = conaa.upper() + filtered = DataFrame() + + filtered = frame[~frame['AA JUNCTION'].str.contains(conaa)] + + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'CDR3 not containing {}'.format(conaa) + + frame = frame[frame['AA JUNCTION'].str.contains(conaa) ] + + head.append('CDR3 containing {}'.format(conaa)) + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + + + + + #####------------keep the small J gene name-------------------- + #frame['J-GENE and allele'] = frame['J-GENE and allele'].map(filter_condition_Jgene) + cc2=np.array(frame['J-GENE and allele'].unique()) + + for x in cc2: + try: + if notnull(x): + x1=x.split('*') + # print(x) + # print (x1[0]) + trbj=x1[0].split(' ') + frame['J-GENE and allele']=frame['J-GENE and allele'].replace(x,trbj[1]) + except IndexError as e: + print('J-Gene has been formed') + + + + x=None + x1=None + + + #------------keep the small D gene name-------------------- + cc1=np.array(frame['D-GENE and allele'].unique()) + for x in cc1: + try: + if notnull(x): + x1=x.split('*') + trbd=x1[0].split(' ') + frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,trbd[1]) + else: + frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,'none') + except IndexError as e: + print('D-gene has been formed') + + + x=None + x1=None + + + if Jgene != 'null': + + filtered = DataFrame() + + filtered = frame[frame['J-GENE and allele'] != Jgene] + + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'J-GENE not {} '.format(Jgene) + + + frame = frame[frame['J-GENE and allele'] == Jgene] + + + + head.append('J-GENE = {} '.format(Jgene)) + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + + + + if Dgene != 'null': + + filtered = DataFrame() + + filtered = frame[frame['D-GENE and allele'] != Dgene] + + filtall = filtall.append(filtered) + if len(filtall) > 0: + filtall.loc[filtered.index,'Reason'] = 'D-GENE not {} '.format(Dgene) + + + frame = frame[frame['D-GENE and allele'] == Dgene] + + + + head.append('D-GENE = {} '.format(Dgene)) + head.append('filter out') + seqlen.append(len(frame)) + seqlen.append(len(filtered)) + + + head.append('Total filter out') + head.append('Total filter in') + seqlen.append(len(filtall)) + seqlen.append(len(frame)) + summ_df = DataFrame(index = head) + col = fname + + summ_df[col] = seqlen + frame=frame.rename(columns = {'V-GENE and allele':'V-GENE', + 'J-GENE and allele':'J-GENE','D-GENE and allele':'D-GENE'}) + + + frcol.append('Reason') + + filtall = filtall[frcol] + + #--------------out CSV--------------------------- + frame.index = range(1,len(frame)+1) + if not summ_df.empty: + summ_df['%'] = (100*summ_df[summ_df.columns[0]]/summ_df[summ_df.columns[0]][summ_df.index[0]]).map(('{:.4f}'.format)) + return(frame,filtall,summ_df) + except KeyError as e: + print('This file has no ' + str(e) + ' column') + return(frame,filtall,summ_df) + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + inp = sys.argv[1] + cells = sys.argv[2] + psorf = sys.argv[3] + con = sys.argv[4] + prod = sys.argv[5] + CF = sys.argv[6] + Vper = float(sys.argv[7]) + Vgene = sys.argv[8] + laa1 = sys.argv[9] + conaa = sys.argv[10] + filterin = sys.argv[11] + filterout = sys.argv[12] + Sum_table = sys.argv[13] + Jgene = sys.argv[14] + Dgene = sys.argv[15] + laa2 = sys.argv[16] + fname = sys.argv[17] + + # Execute basic function + fin,fout,summ = filtering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname) + + # Save output to CSV files + if not summ.empty: + summ.to_csv(Sum_table, sep = '\t') + if not fin.empty: + fin.to_csv(filterin , sep = '\t') + if not fout.empty: + fout.to_csv(filterout, sep= '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) + diff -r 000000000000 -r 8be019b173e6 cmpb2016/ngs_filtering.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/ngs_filtering.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,139 @@ + + Filter IMGT Summary Data + ngs_filtering.py $input $TCR_or_BCR $Vfun $spChar $prod $delCF $threshold $Vg.Vgid $clen.cdr3len1 $cdp.cdr3part $filtin $filtout $summ $Jg.Jgid $Dg.Dgid $clen.cdr3len2 $process_id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool filters IMGT Summary Data based on a combination of criteria. + + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_CDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/pub_clono_CDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Dec 21 18:26:01 2015 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +from numpy import nan as NA +import sys +import time + + + +def publicCDR3Func(inputs,thres): + + cdr3=DataFrame() + + for x in range(0,len(inputs),2): + cl = DataFrame() + cl = read_csv(inputs[x] , sep = '\t' , index_col = 0) + #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + #cl = concat([chunk for chunk in tp]) + + if (thres != 'null'): + cl = cl[cl['Reads'] > int(thres)] + + x1 = inputs[x+1].split('_') + + del cl['Reads'] + cl.columns = [cl.columns[0], x1[0]+' '+cl.columns[1], x1[0]+' Relative '+cl.columns[2]] + + if cdr3.empty: + cdr3 = cl + else: + cdr3 = cdr3.merge(cl, how='outer', on='AA JUNCTION') + + + col = cdr3.columns + freqs = col.map(lambda x: 'Frequency' in x) + reads = col.map(lambda x: 'Reads/Total' in x) + + cdr3[col[freqs]] = cdr3[col[freqs]].fillna(0) + cdr3[col[reads]] = cdr3[col[reads]].fillna('0/*') + + cdr3['Num of Patients']= cdr3[col[freqs]].apply(lambda x: np.sum(x != 0), axis=1) + + cdr3 = cdr3[cdr3['Num of Patients'] > 1] + + cdr3.index = range(1,len(cdr3)+1) + + return cdr3 + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + threshold = sys.argv[2] + arg = sys.argv[3:] + output = sys.argv[1] + + # Execute basic function + mer = publicCDR3Func(arg,threshold) + + # Save output to CSV files + if not mer.empty: + mer.to_csv(output , sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_CDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/pub_clono_CDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,39 @@ + +Compute Public CDR3 Clonotypes + +pub_clono_CDR3.py "$output1" "$Th.thres" +#for x in $clono_files + "$x.clfile" + "$x.clfile.name" +#end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool computes the public CDR3 clonotypes among a number of patients along with their frequencies for each patient. + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_JCDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/pub_clono_JCDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Feb 26 19:08:28 2016 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +from numpy import nan as NA +import sys +import time + + + +def publicJGeneClonoFunc(inputs,thres): + + clono=DataFrame() + + for x in range(0,len(inputs),2): + cl = DataFrame() + cl = read_csv(inputs[x] , sep = '\t' , index_col = 0) + #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + #cl = concat([chunk for chunk in tp]) + + if (thres != 'null'): + cl = cl[cl['Reads'] > int(thres)] + + x1 = inputs[x+1].split('_') + + del cl['Reads'] + cl.columns = [cl.columns[0], cl.columns[1], x1[0]+' '+cl.columns[2], x1[0]+' Relative '+cl.columns[3]] + + if clono.empty: + clono = cl + else: + clono = clono.merge(cl, how='outer', on=['J-GENE','AA JUNCTION']) + + + col = clono.columns + freqs = col.map(lambda x: 'Frequency' in x) + reads = col.map(lambda x: 'Reads/Total' in x) + + clono[col[freqs]] = clono[col[freqs]].fillna(0) + clono[col[reads]] = clono[col[reads]].fillna('0/*') + + clono['Num of Patients']= clono[col[freqs]].apply(lambda x: np.sum(x != 0), axis=1) + + clono = clono[clono['Num of Patients'] > 1] + + clono.index = range(1,len(clono)+1) + + return clono + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + arg = sys.argv[3:] + output = sys.argv[1] + thres = sys.argv[2] + + + # Execute basic function + mer = publicJGeneClonoFunc(arg,thres) + + # Save output to CSV files + if not mer.empty: + mer.to_csv(output , sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_JCDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/pub_clono_JCDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,39 @@ + +Compute Public J+CDR3 Clonotypes + +pub_clono_JCDR3.py "$output1" "$Th.thres" +#for x in $clono_files + "$x.clfile" + "$x.clfile.name" +#end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool computes the public (J-gene, CDR3) Clonotypes among a number of patients along with their frequencies for each patient. + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_VCDR3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/pub_clono_VCDR3.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Feb 26 18:49:15 2016 + +@author: chmaramis +""" + +from __future__ import division +import numpy as np +from pandas import * +from numpy import nan as NA +import sys +import time + + + +def publicVGeneClonoFunc(inputs,thres): + + clono=DataFrame() + + for x in range(0,len(inputs),2): + cl = DataFrame() + cl = read_csv(inputs[x] , sep = '\t' , index_col = 0) + #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) + #cl = concat([chunk for chunk in tp]) + + if (thres != 'null'): + cl = cl[cl['Reads'] > int(thres)] + + x1 = inputs[x+1].split('_') + + del cl['Reads'] + cl.columns = [cl.columns[0], cl.columns[1], x1[0]+' '+cl.columns[2], x1[0]+' Relative '+cl.columns[3]] + + if clono.empty: + clono = cl + else: + clono = clono.merge(cl, how='outer', on=['V-GENE','AA JUNCTION']) + + + col = clono.columns + freqs = col.map(lambda x: 'Frequency' in x) + reads = col.map(lambda x: 'Reads/Total' in x) + + clono[col[freqs]] = clono[col[freqs]].fillna(0) + clono[col[reads]] = clono[col[reads]].fillna('0/*') + + clono['Num of Patients']= clono[col[freqs]].apply(lambda x: np.sum(x != 0), axis=1) + + clono = clono[clono['Num of Patients'] > 1] + + clono.index = range(1,len(clono)+1) + + return clono + + +if __name__ == '__main__': + + start=time.time() + + # Parse input arguments + arg = sys.argv[3:] + output = sys.argv[1] + thres = sys.argv[2] + + + # Execute basic function + mer = publicVGeneClonoFunc(arg,thres) + + # Save output to CSV files + if not mer.empty: + mer.to_csv(output , sep = '\t') + + # Print execution time + stop=time.time() + print('Runtime:' + str(stop-start)) diff -r 000000000000 -r 8be019b173e6 cmpb2016/pub_clono_VCDR3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/pub_clono_VCDR3.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,39 @@ + +Compute Public V+CDR3 Clonotypes + +pub_clono_VCDR3.py "$output1" "$Th.thres" +#for x in $clono_files + "$x.clfile" + "$x.clfile.name" +#end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool computes the public (V-gene, CDR3) Clonotypes among a number of patients along with their frequencies for each patient. + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/top10_CDR3_exact_pairing.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/top10_CDR3_exact_pairing.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Apr 18 09:48:00 2016 + +@author: chmaramis +""" + +import pandas as pd +import numpy as np +import sys + + +if __name__ == "__main__": + + clonosFN = sys.argv[1] + outFN = sys.argv[2] + + Cl = pd.read_csv(clonosFN,sep='\t',index_col=0) + T10 = Cl[:10].copy() + + aa_junction = np.array(T10['AA JUNCTION']) + geneCol = [x for x in T10.columns if x.upper().endswith('GENE')][0] + + vG_unique = np.unique(Cl[geneCol]) + noVG = len(vG_unique) + + F = np.zeros((noVG,20)) + + for i in range(0,10): + taa = T10['AA JUNCTION'][i+1] + sameAA = np.where(Cl['AA JUNCTION'] == taa)[0]+1 + if Cl[geneCol][sameAA[0]] != T10[geneCol][i+1]: + print('We have a problem here!') + + # Make original Gene -1 + #orGene = Cl['V-GENE'][sameAA[0]] + #orGeneUn = np.where(vG_unique == orGene)[0][0] + #F[orGeneUn,i] = -1 + + # Other Genes + for j in range(0,len(sameAA)): + othGene = Cl[geneCol][sameAA[j]] + othRead = Cl['Reads'][sameAA[j]] + othFreq = Cl['Frequency %'][sameAA[j]] + orGeneUn = np.where(vG_unique == othGene)[0][0] + F[orGeneUn,2*i] += othRead + F[orGeneUn,2*i+1] += othFreq + + + K = list(aa_junction+' Reads') + L = list(aa_junction+' Freq. %') + columns = [val for pair in zip(K,L) for val in pair] + + D = pd.DataFrame(F,columns=columns,index=vG_unique) + D.to_csv(outFN,sep='\t') + diff -r 000000000000 -r 8be019b173e6 cmpb2016/top10_CDR3_exact_pairing.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/top10_CDR3_exact_pairing.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,17 @@ + +Pair CDR3 of top10 clonotypes with combined genes + +top10_CDR3_exact_pairing.py "$clonos" "$out" + + + + + + + + + +Lists the genes with which the AA Junctions (CDR3) of the Top-10 clonotypes are combined. Works for both V-Gene and J-Gene clonotypes. + + + diff -r 000000000000 -r 8be019b173e6 cmpb2016/top10_CDR3_inexact_pairing.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/top10_CDR3_inexact_pairing.py Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Apr 18 11:37:40 2016 + +@author: chmaramis +""" + +# -*- coding: utf-8 -*- +""" +Created on Mon Apr 18 09:48:00 2016 + +@author: chmaramis +""" + +import pandas as pd +import numpy as np +import sys +import functools as ft + +def maxHam1(s1, s2): + if len(s1) != len(s2): + return False + else: + return sum(c1 != c2 for c1, c2 in zip(s1, s2)) <= 1 + + +if __name__ == "__main__": + + clonosFN = sys.argv[1] + outFN = sys.argv[2] + + Cl = pd.read_csv(clonosFN,sep='\t',index_col=0) + T10 = Cl[:10].copy() + + aa_junction = np.array(T10['AA JUNCTION']) + geneCol = [x for x in T10.columns if x.upper().endswith('GENE')][0] + + F = np.zeros((2,20)) + + for i in range(0,10): + taa = T10['AA JUNCTION'][i+1] + gene = T10[geneCol][i+1] + S1 = Cl['AA JUNCTION'].apply(ft.partial(maxHam1, s2=taa)) + S2 = Cl[geneCol] == gene + S1[i+1] = False + F[0,2*i] = (S1 & S2).sum() + F[0,2*i+1] = Cl['Frequency %'][S1 & S2].sum() + F[1,2*i] = (S1 & ~S2).sum() + F[1,2*i+1] = Cl['Frequency %'][S1 & ~S2].sum() + + + K = list(aa_junction+' Nr. Clonos') + L = list(aa_junction+' Freq. %') + columns = [val for pair in zip(K,L) for val in pair] + + D = pd.DataFrame(F,columns=columns, index=['same gene', 'different gene']) + D.to_csv(outFN,sep='\t') + diff -r 000000000000 -r 8be019b173e6 cmpb2016/top10_CDR3_inexact_pairing.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpb2016/top10_CDR3_inexact_pairing.xml Sun Mar 18 05:54:20 2018 -0400 @@ -0,0 +1,17 @@ + +Pair CDR3 of top10 clonotypes with similar CDR3 + +top10_CDR3_inexact_pairing.py "$clonos" "$out" + + + + + + + + + +Provides the number of clonotypes with AA Junction (CDR3) similar to that of the top-10 clonotypes (maximum 1 AA difference). Distinguishes between clonotypes using the same gene and different gene. Works for both V-Gene and J-Gene clonotypes. + + +