Mercurial > repos > chmaramis > irprofiler
view gene_computation.py @ 1:acaa8e8a0b88 draft default tip
Uploaded test-data & added tool help
author | chmaramis |
---|---|
date | Mon, 30 Apr 2018 04:47:52 -0400 |
parents | 0e37e5b73273 |
children |
line wrap: on
line source
# -*- coding: utf-8 -*-
"""
Per-gene clonotype statistics for a tab-separated clonotype table.

Created on Fri Jun 20 14:58:08 2014

@author: chmaramis
"""

from __future__ import division

import functools as ft
import sys
import time

import numpy as np  # not used in this module; import kept from the original file
from pandas import DataFrame, read_csv

# Maps the one-letter gene code given on the command line to the name of the
# column that is grouped on in the input table.
gene_options = {'V': 'V-GENE', 'J': 'J-GENE'}


def frm(x, y):
    """Return ``x`` and ``y`` joined as the string ``'x/y'``."""
    return '{r}/{l}'.format(r=x, l=y)


def geneComputation(inp_name, gene, fname):
    """Count the rows of a clonotype table per gene and summarise the result.

    Parameters
    ----------
    inp_name : path or file-like object
        Tab-separated table; its first column is read as the row index.
    gene : str
        Key of ``gene_options`` (``'V'`` or ``'J'``) selecting the column to
        group on.
    fname : str
        Label whose part before the first ``'_'`` becomes the name of the
        value column in the summary frame.

    Returns
    -------
    tuple of (rep, su, summdf)
        rep : one row per distinct gene, ordered by descending count and
            indexed from 1, with the columns ``<gene column>``,
            ``'Clonotypes'``, ``'Clonotypes/Total'`` and ``'Frequency %'``.
            The total is the number of rows in the input table; rows whose
            gene value is missing belong to no group (pandas drops missing
            group keys) but still count towards that total.
        su : the gene and ``'Frequency %'`` columns of the first ten rows of
            ``rep``.
        summdf : three rows ('Gene Family', 'Dominant Gene', 'Frequency')
            describing the most frequent gene, plus an empty ``'%'`` column.
            It has no rows when ``rep`` is empty.

    Raises
    ------
    KeyError
        If ``gene`` is not a key of ``gene_options`` or the selected column
        is absent from the table.
    """
    gene_full = gene_options[gene]
    df = read_csv(inp_name, sep='\t', index_col=0)
    total = len(df)

    # Number of rows per distinct gene value.
    counts = df.groupby([gene_full]).size()

    rep = DataFrame(list(counts.index), columns=[gene_full])
    rep['Clonotypes'] = counts.values
    rep['Clonotypes/Total'] = rep['Clonotypes'].map(ft.partial(frm, y=total))
    rep['Frequency %'] = (100 * rep['Clonotypes'] / total).map('{:.4f}'.format)
    rep = rep.sort_values(by=['Clonotypes'], ascending=False)
    rep.index = range(1, len(rep) + 1)  # 1-based rank instead of 0-based position

    su = rep[[gene_full, 'Frequency %']].head(10)

    sample = fname.split('_')[0]
    if su.empty:
        # No gene was observed (header-only table, or every gene value is
        # missing): there is no dominant gene to report. Return a frame with
        # the usual columns but no rows, which the caller's ``.empty`` check
        # treats as "nothing to write", instead of failing on row 0.
        summdf = DataFrame(columns=[sample, '%'])
        return (rep, su, summdf)

    summdf = DataFrame(
        [gene_full, su[gene_full].iloc[0], su['Frequency %'].iloc[0]],
        index=['Gene Family', 'Dominant Gene', 'Frequency'],
        columns=[sample],
    )
    summdf['%'] = ''

    return (rep, su, summdf)


def main(argv=None):
    """Run the computation from the command line and return an exit status.

    Expected arguments, in order: input table, gene code, report output,
    top-10 summary output, dominant-gene summary output, sample label.
    Each output file is written only when its frame is non-empty.
    """
    if argv is None:
        argv = sys.argv

    start = time.time()

    # Parse input arguments
    if len(argv) < 7:
        sys.stderr.write(
            'Usage: {prog} <input.tsv> <gene> <report_out> <summary_out> '
            '<summary2_out> <sample_label>\n'.format(prog=argv[0] if argv else 'gene_computation.py')
        )
        return 2
    inp_name, gene, outrep, summ_rep, summ_rep2, fname = argv[1:7]

    if gene not in gene_options:
        sys.stderr.write(
            'Unknown gene code {g!r}; expected one of: {opts}\n'.format(
                g=gene, opts=', '.join(sorted(gene_options)))
        )
        return 2

    # Execute basic function
    rep, su, summdf = geneComputation(inp_name, gene, fname)

    # Save output to CSV files
    for frame, path in ((rep, outrep), (su, summ_rep), (summdf, summ_rep2)):
        if not frame.empty:
            frame.to_csv(path, sep='\t')

    # Print execution time
    stop = time.time()
    print('Runtime:' + str(stop - start))
    return 0


if __name__ == '__main__':
    sys.exit(main())