Mercurial > repos > chmaramis > testirprofiler
comparison cmpb2016/comp_clono_JCDR3.py @ 0:8be019b173e6 draft
Uploaded included tools
| author | chmaramis |
|---|---|
| date | Sun, 18 Mar 2018 05:54:20 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8be019b173e6 |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """ | |
| 3 Created on Thu Jun 19 17:33:34 2014 | |
| 4 | |
| 5 @author: chmaramis | |
| 6 """ | |
| 7 | |
| 8 from __future__ import division | |
| 9 import numpy as np | |
| 10 from pandas import * | |
| 11 import functools as ft | |
| 12 import sys | |
| 13 import time | |
| 14 | |
def frm(x, y):
    """Return ``x`` and ``y`` rendered as the text ``x/y``."""
    return '{r}/{l}'.format(r=x, l=y)
| 16 | |
| 17 def clonotypeComputationJ(inp_name,out1,t10n,fname): | |
| 18 | |
| 19 frame = DataFrame() | |
| 20 tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 ) | |
| 21 frame = concat([chunk for chunk in tp]) | |
| 22 | |
| 23 grouped = frame.groupby(['J-GENE','AA JUNCTION']) | |
| 24 x=grouped.size() | |
| 25 x1=DataFrame(list(x.index), columns=['J-GENE','AA JUNCTION']) | |
| 26 x1['Reads']=x.values | |
| 27 total = sum(x1['Reads']) | |
| 28 #x1['Reads/Total'] = ['{r}/{l}'.format(r=pr , l = total) for pr in x1['Reads']] | |
| 29 x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total)) | |
| 30 x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format) | |
| 31 | |
| 32 final = x1.sort_values(by = ['Reads'] , ascending = False) | |
| 33 | |
| 34 final.index=range(1,len(final)+1) | |
| 35 final.to_csv(out1 , sep = '\t') | |
| 36 | |
| 37 numofclono = len(final) | |
| 38 clust = len(final[final['Reads'] > 1]) | |
| 39 sing = len (final[final['Reads'] == 1]) | |
| 40 top10 = final[['J-GENE','AA JUNCTION','Frequency %']].head(10) | |
| 41 top10.to_csv(t10n , sep = '\t') | |
| 42 | |
| 43 summary = [[str(top10['J-GENE'].values[0]+','+top10['AA JUNCTION'].values[0])]] | |
| 44 summary.append([top10['Frequency %'].values[0]]) | |
| 45 summary.append([numofclono]) | |
| 46 summary.append([clust,'{:.4f}'.format(100*clust/numofclono)]) | |
| 47 summary.append([sing,'{:.4f}'.format(100*sing/numofclono)]) | |
| 48 | |
| 49 | |
| 50 ind = ['Dominant Clonotype (J+CDR3)', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes', 'Singletons'] | |
| 51 spl = fname.split('_') | |
| 52 col = [spl[0],'%'] | |
| 53 | |
| 54 frsum = DataFrame(summary,index = ind, columns = col) | |
| 55 | |
| 56 return frsum | |
| 57 | |
| 58 | |
def main(argv=None):
    """Run the clonotype computation from the command line.

    Expects five positional arguments, in this order: input table path,
    clonotype output path, top-10 output path, summary output path and the
    name passed to ``clonotypeComputationJ`` as ``fname``.  Additional
    arguments are ignored.

    Parameters:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.
    """
    start = time.time()

    # Parse input arguments
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 5:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit('Usage: {} <input> <clonotypes_out> <top10_out> '
                 '<summary_out> <name>'.format(sys.argv[0]))
    inp_name, out1, t10n, sname, fname = args[:5]

    # Execute basic function
    frsum = clonotypeComputationJ(inp_name, out1, t10n, fname)

    # Save output to CSV files
    if not frsum.empty:
        frsum.to_csv(sname, sep='\t')

    # Print execution time
    stop = time.time()
    print('Runtime:' + str(stop - start))


if __name__ == '__main__':
    main()
