| 0 | 1 # -*- coding: utf-8 -*- | 
|  | 2 """ | 
|  | 3 Created on Fri Jun 20 14:58:08 2014 | 
|  | 4 | 
|  | 5 @author: chmaramis | 
|  | 6 """ | 
|  | 7 | 
|  | 8 from __future__ import division | 
|  | 9 import numpy as np | 
|  | 10 from pandas import * | 
|  | 11 import functools as ft | 
|  | 12 import sys | 
|  | 13 import time | 
|  | 14 | 
def frm(x, y):
    """Return ``x`` and ``y`` joined as the string ``'x/y'``."""
    return '{r}/{l}'.format(r=x, l=y)


# Maps the one-letter gene selector to the name of the input column that is
# grouped on.
gene_options = {'V': 'V-GENE',
                'J': 'J-GENE'}


def geneComputation(inp_name, gene, fname):
    """Build gene usage reports from a tab-separated input table.

    The table is read with its first column as the index and its rows are
    grouped on the column selected by ``gene``.

    Parameters
    ----------
    inp_name : str
        Path of the tab-separated input file. It must contain the column
        named by ``gene_options[gene]``.
    gene : str
        Key of ``gene_options`` ('V' or 'J').
    fname : str
        Label for the summary; the text before its first underscore becomes
        the name of the summary's value column.

    Returns
    -------
    tuple
        ``(rep, su, summdf)`` where

        * ``rep`` has one row per distinct gene with the columns
          ``<gene column>``, ``Clonotypes`` (number of rows in the group),
          ``Clonotypes/Total`` and ``Frequency %`` (four decimals, as text),
          sorted by descending ``Clonotypes`` and indexed from 1;
        * ``su`` is the gene and ``Frequency %`` columns of the first ten
          rows of ``rep``;
        * ``summdf`` is a three-row summary ('Gene Family', 'Dominant Gene',
          'Frequency') plus an empty ``%`` column.

        When the input yields no gene groups, ``rep`` and ``su`` are empty
        and the 'Dominant Gene' / 'Frequency' cells of ``summdf`` are empty
        strings.

    Raises
    ------
    KeyError
        If ``gene`` is not a key of ``gene_options`` or the gene column is
        absent from the input.
    """
    gene_full = gene_options[gene]

    df = read_csv(inp_name, sep='\t', index_col=0)
    # The total counts every input row. Rows whose gene value is missing
    # fall into no group but still contribute to this denominator.
    total = len(df)

    counts = df.groupby(gene_full).size()
    rep = DataFrame(list(counts.index), columns=[gene_full])
    rep['Clonotypes'] = counts.values
    rep['Clonotypes/Total'] = rep['Clonotypes'].map(ft.partial(frm, y=total))
    rep['Frequency %'] = (100 * rep['Clonotypes'] / total).map('{:.4f}'.format)

    # A stable sort keeps tied genes in their grouped (sorted-by-name) order,
    # which makes the reported dominant gene deterministic.
    rep = rep.sort_values(by=['Clonotypes'], ascending=False, kind='mergesort')
    rep.index = range(1, len(rep) + 1)

    su = rep[[gene_full, 'Frequency %']].head(10)

    # Without any gene group there is no first row to read; report blanks
    # instead of failing with an IndexError.
    if len(su) > 0:
        dominant_gene = su[gene_full].values[0]
        dominant_freq = su['Frequency %'].values[0]
    else:
        dominant_gene = ''
        dominant_freq = ''

    sample_label = fname.split('_')[0]
    summdf = DataFrame([gene_full, dominant_gene, dominant_freq],
                       index=['Gene Family', 'Dominant Gene', 'Frequency'],
                       columns=[sample_label])
    summdf['%'] = ''

    return (rep, su, summdf)
|  | 49 | 
|  | 50 | 
if __name__ == '__main__':

    start = time.time()

    # Six positional arguments are read below; stop with a usage message
    # rather than an IndexError traceback when any of them is missing.
    if len(sys.argv) < 7:
        sys.exit('Usage: {prog} <input_tsv> <gene> <report_out> '
                 '<top10_out> <summary_out> <file_name>'.format(prog=sys.argv[0]))

    # Parse input arguments
    inp_name = sys.argv[1]   # tab-separated input table
    gene = sys.argv[2]       # key of gene_options
    outrep = sys.argv[3]     # destination of the full per-gene report
    summ_rep = sys.argv[4]   # destination of the first-ten-rows report
    summ_rep2 = sys.argv[5]  # destination of the three-row summary
    fname = sys.argv[6]      # label; text before the first '_' names the summary column

    # Reject an unknown gene selector before doing any work.
    if gene not in gene_options:
        sys.exit('Unknown gene option {g!r}; expected one of: {opts}'.format(
            g=gene, opts=', '.join(sorted(gene_options))))

    # Execute basic function
    rep, su, summdf = geneComputation(inp_name, gene, fname)

    # Save output to tab-separated files; an empty table is not written.
    for table, destination in ((rep, outrep), (su, summ_rep), (summdf, summ_rep2)):
        if not table.empty:
            table.to_csv(destination, sep='\t')

    # Print execution time
    stop = time.time()
    print('Runtime:' + str(stop - start))