# HG changeset patch # User chmaramis # Date 1521371194 14400 # Node ID b5bb2e8e829c1e97177dcc0b4894d8b98478234e # Parent 6a8ecfdb9462715d7bae43ec2646b06e732d7302 Deleted selected files diff -r 6a8ecfdb9462 -r b5bb2e8e829c cmpb2016/comp_clono_VCDR3.py --- a/cmpb2016/comp_clono_VCDR3.py Sun Mar 18 07:06:18 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Jun 19 17:33:34 2014 - -@author: chmaramis -""" - -from __future__ import division -import numpy as np -from pandas import * -import functools as ft -import sys -import time - -frm = lambda x,y: '{r}/{l}'.format(r=x,l=y) - -def clonotypeComputation(inp_name, out1, t10n, fname): - - frame = DataFrame() - tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) - frame = concat([chunk for chunk in tp]) - - - grouped = frame.groupby(['V-GENE','AA JUNCTION']) - x=grouped.size() - x1=DataFrame(list(x.index), columns=['V-GENE','AA JUNCTION']) - x1['Reads']=x.values - total = sum(x1['Reads']) - #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']] - x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total)) - x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format) - - final = x1.sort_values(by = ['Reads'] , ascending = False) - - final.index=range(1,len(final)+1) - final.to_csv(out1 , sep = '\t') - - numofclono = len(final) - clust = len(final[final['Reads'] > 1]) - sing = len (final[final['Reads'] == 1]) - top10 = final[['V-GENE','AA JUNCTION','Frequency %']].head(10) - top10.to_csv(t10n , sep = '\t') - - summary = [[str(top10['V-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]] - summary.append([top10['Frequency %'].values[0]]) - summary.append([numofclono]) - summary.append([clust,'{:.4f}'.format(100*clust/numofclono)]) - summary.append([sing,'{:.4f}'.format(100*sing/numofclono)]) - - ind = ['Dominant Clonotype (V+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons'] - spl = fname.split('_') - col = [spl[0],'%'] - - frsum = DataFrame(summary,index = ind, columns = col) - - return frsum - - -if __name__ == '__main__': - - start=time.time() - - # Parse input arguments - inp_name = sys.argv[1] - out1 = sys.argv[2] - t10n = sys.argv[3] - sname = sys.argv[4] - fname = sys.argv[5] - - # Execute basic function - frsum = clonotypeComputation(inp_name,out1,t10n,fname) - - # Save output to CSV files - if not frsum.empty: - frsum.to_csv(sname, sep = '\t') - - # Print execution time - stop=time.time() - print('Runtime:' + str(stop-start)) diff -r 6a8ecfdb9462 -r b5bb2e8e829c cmpb2016/comp_clono_VCDR3.xml --- a/cmpb2016/comp_clono_VCDR3.xml Sun Mar 18 07:06:18 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ - - Compute V+CDR3 clonotypes - comp_clono_VCDR3.py $input $clonos $topcl $summ2 ${input.name} - - - - - - - - - - - - - - - - - -This tool computes the (V-gene, CDR3) clonotypes and their frequencies. - - - diff -r 6a8ecfdb9462 -r b5bb2e8e829c cmpb2016/comp_clono_VDJCDR3.py --- a/cmpb2016/comp_clono_VDJCDR3.py Sun Mar 18 07:06:18 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Dec 3 14:54:00 2015 - -@author: chmaramis -""" - -from __future__ import division -import numpy as np -from pandas import * -import functools as ft -import sys -import time - -frm = lambda x,y: '{r}/{l}'.format(r=x,l=y) - -def clonotypeComputationVDJ(inp_name,out1,t10n,fname): - - frame = DataFrame() - tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) - frame = concat([chunk for chunk in tp]) - - grouped = frame.groupby(['V-GENE','D-GENE','J-GENE','AA JUNCTION']) - x=grouped.size() - x1=DataFrame(list(x.index), columns=['V-GENE','D-GENE','J-GENE','AA JUNCTION']) - x1['Reads']=x.values - total = sum(x1['Reads']) - #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']] - x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total)) - x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format) - - final = x1.sort_values(by = ['Reads'] , ascending = False) - #final = x1.sort_values(by = ['Reads'] , ascending = False) - - final.index=range(1,len(final)+1) - final.to_csv(out1 , sep = '\t') - - numofclono = len(final) - clust = len(final[final['Reads'] > 1]) - sing = len (final[final['Reads'] == 1]) - top10 = final[['V-GENE','D-GENE','J-GENE','AA JUNCTION','Frequency %']].head(10) - top10.to_csv(t10n , sep = '\t') - - summary = [[str(top10['V-GENE'].values[0]+','+top10['D-GENE'].values[0]+','+top10['J-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]] - summary.append([top10['Frequency %'].values[0]]) - summary.append([numofclono]) - summary.append([clust,'{:.4f}'.format(100*clust/numofclono)]) - summary.append([sing,'{:.4f}'.format(100*sing/numofclono)]) - - - ind = ['Dominant Clonotype (V+D+J+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons'] - spl = fname.split('_') - col = [spl[0],'%'] - - frsum = DataFrame(summary,index = ind, columns = col) - - return frsum - -if __name__ == '__main__': - - start=time.time() - - # Parse input arguments - inp_name = sys.argv[1] - out1 = sys.argv[2] - t10n = sys.argv[3] - sname = sys.argv[4] - fname = sys.argv[5] - - # Execute basic function - frsum = clonotypeComputationVDJ(inp_name,out1,t10n,fname) - - # Save output to CSV files - if not frsum.empty: - frsum.to_csv(sname, sep = '\t') - - # Print execution time - stop=time.time() - print('Runtime:' + str(stop-start)) diff -r 6a8ecfdb9462 -r b5bb2e8e829c cmpb2016/comp_clono_VDJCDR3.xml --- a/cmpb2016/comp_clono_VDJCDR3.xml Sun Mar 18 07:06:18 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ - - Compute V+D+J+CDR3 clonotypes - comp_clono_VDJCDR3.py $input $clonos $topcl $summ2 ${input.name} - - - - - - - - - - - - - - - - - -This tool computes the (V-gene, D-gene, J-gene, CDR3) clonotypes and their frequencies. - - - diff -r 6a8ecfdb9462 -r b5bb2e8e829c cmpb2016/comp_clono_VJCDR3.py --- a/cmpb2016/comp_clono_VJCDR3.py Sun Mar 18 07:06:18 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Oct 23 17:33:34 2014 - -@author: chmaramis -""" - -from __future__ import division -import numpy as np -from pandas import * -import functools as ft -import sys -import time - -frm = lambda x,y: '{r}/{l}'.format(r=x,l=y) - -def clonotypeComputationVJ(inp_name,out1,t10n,fname): - - frame = DataFrame() - tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) - frame = concat([chunk for chunk in tp]) - - grouped = frame.groupby(['V-GENE','J-GENE','AA JUNCTION']) - x=grouped.size() - x1=DataFrame(list(x.index), columns=['V-GENE','J-GENE','AA JUNCTION']) - x1['Reads']=x.values - total = sum(x1['Reads']) - #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']] - x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total)) - x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format) - - final = x1.sort_values(by = ['Reads'] , ascending = False) - #final = x1.sort_values(by = ['Reads'] , ascending = False) - - final.index= range(1,len(final)+1) - final.to_csv(out1 , sep = '\t') - - numofclono = len(final) - clust = len(final[final['Reads'] > 1]) - sing = len (final[final['Reads'] == 1]) - top10 = final[['V-GENE','J-GENE','AA JUNCTION','Frequency %']].head(10) - top10.to_csv(t10n , sep = '\t') - - summary = [[str(top10['V-GENE'].values[0]+','+top10['J-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]] - summary.append([top10['Frequency %'].values[0]]) - summary.append([numofclono]) - summary.append([clust,'{:.4f}'.format(100*clust/numofclono)]) - summary.append([sing,'{:.4f}'.format(100*sing/numofclono)]) - - - ind = ['Dominant Clonotype (V+J+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons'] - spl = fname.split('_') - col = [spl[0],'%'] - - frsum = DataFrame(summary,index = ind, columns = col) - - return frsum - -if __name__ == '__main__': - - start=time.time() - - # Parse input arguments - inp_name = sys.argv[1] - out1 = sys.argv[2] - t10n = sys.argv[3] - sname = sys.argv[4] - fname = sys.argv[5] - - # Execute basic function - frsum = clonotypeComputationVJ(inp_name,out1,t10n,fname) - - # Save output to CSV files - if not frsum.empty: - frsum.to_csv(sname, sep = '\t') - - # Print execution time - stop=time.time() - print('Runtime:' + str(stop-start)) diff -r 6a8ecfdb9462 -r b5bb2e8e829c cmpb2016/comp_clono_VJCDR3.xml --- a/cmpb2016/comp_clono_VJCDR3.xml Sun Mar 18 07:06:18 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ - - Compute V+J+CDR3 clonotypes - comp_clono_VJCDR3.py $input $clonos $topcl $summ2 ${input.name} - - - - - - - - - - - - - - - - - -This tool computes the (V-gene, J-gene, CDR3) clonotypes and their frequencies. - - - diff -r 6a8ecfdb9462 -r b5bb2e8e829c cmpb2016/compare_repertoire_J.py --- a/cmpb2016/compare_repertoire_J.py Sun Mar 18 07:06:18 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Mon Feb 29 10:18:39 2016 - -@author: chmaramis -""" - -from __future__ import division -import numpy as np -from pandas import * -from numpy import nan as NA -import sys -import time - -sw_reads = lambda x: x.startswith('Reads') -sw_freq = lambda x: x.startswith('Freq') -sw_gene = lambda x: x.startswith('J') - -def freqtoall(inputs): - - mer=DataFrame() - - for x in range(0,len(inputs),2): - - ini = read_csv(inputs[x] , sep = '\t' , index_col = 0) - - ini.drop(ini.columns[np.where(ini.columns.map(sw_reads))[0]], axis=1, inplace=True) - - x1 = inputs[x+1].split('_') - ini.rename(columns={ini.columns[np.where(ini.columns.map(sw_freq))[0][0]]: x1[0]}, inplace=True) - - if mer.empty: - mer = DataFrame(ini) - else: - mer = merge(mer,ini, on=ini.columns[np.where(ini.columns.map(sw_gene))[0][0]] , how='outer') - - mer=mer.fillna(0) - mer['mean'] = mer.sum(axis=1)/(len(mer.columns)-1) - fr = 'mean' - - mer=mer.sort_values(by = fr,ascending=False) - mer[fr] = mer[fr].map('{:.4f}'.format) - mer.index = range(1,len(mer)+1) - - return mer - - -if __name__ == '__main__': - - start=time.time() - - # Parse input arguments - inputs = sys.argv[2:] - output = sys.argv[1] - - # Execute basic function - mer = freqtoall(inputs) - - # Save output to CSV files - if not mer.empty: - mer.to_csv(output , sep = '\t') - - # Print execution time - stop=time.time() - print('Runtime:' + str(stop-start)) diff -r 6a8ecfdb9462 -r b5bb2e8e829c cmpb2016/compare_repertoire_J.xml --- a/cmpb2016/compare_repertoire_J.xml Sun Mar 18 07:06:18 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ - -Compare J-gene repertoires - -compare_repertoire_J.py "${output1}" -#for x in $rep_files - "$x.rpfile" - "$x.rpfile.name" -#end for - - - - - - - - - - -This tool produces a union of all patients' J-gene repertoires and computes the mean frequency of each J-gene. - -