Galaxy |

Changeset 0:0e37e5b73273 (2018-03-30)

Next changeset 1:acaa8e8a0b88 (2018-04-30)

Commit message:
Initial commit

added:
README.rst
clonotype_computation.py
clonotype_computation.xml
data_filtering.py
data_filtering.xml
exclusive_clonotype_computation.py
exclusive_clonotype_computation.xml
gene_comparison.py
gene_comparison.xml
gene_computation.py
gene_computation.xml
public_clonotype_computation.py
public_clonotype_computation.xml

diff -r 000000000000 -r 0e37e5b73273 README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst Fri Mar 30 07:22:29 2018 -0400

@@ -0,0 +1,9 @@
+About
+-----
+
+IRProfiler is a Galaxy toolbox for immunogenetic repertoire profiling. It is made available as supplementary material for the article *IRProfiler - A Software Toolbox for High Throughput Immune Receptor Profiling*, authored by C. Maramis et al. and submitted for possible publication to `BMC Bioinformatics <https://bmcbioinformatics.biomedcentral.com>`_.
+
+Tools
+-----
+
+IRProfiler consists of 6 tools. All tools require the Pandas library (v 0.19).

diff -r 000000000000 -r 0e37e5b73273 clonotype_computation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/clonotype_computation.py Fri Mar 30 07:22:29 2018 -0400

[

@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar 24 14:00:24 2018
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+import functools as ft
+import sys
+import time
+
+def frm(x, y):
+    """Format the count *x* against the total *y* as the string 'x/y'."""
+    return '{r}/{l}'.format(r=x,l=y)
+
+
+# Columns that jointly identify a clonotype under each supported definition
+clono_def = {'CDR3': ['AA JUNCTION'],
+             'VCDR3': ['V-GENE','AA JUNCTION'],
+             'JCDR3': ['J-GENE','AA JUNCTION'],
+             'VJCDR3': ['V-GENE','J-GENE','AA JUNCTION'],
+             'VDJCDR3': ['V-GENE','D-GENE','J-GENE','AA JUNCTION']}
+
+
+def clonotypeComputation(inp_name, clono, out1, t10n, fname):
+    """Group the reads of a tab-separated table into clonotypes.
+
+    Parameters
+    ----------
+    inp_name : path of the tab-separated input (first column is the index).
+    clono : key of ``clono_def`` selecting the columns that define a clonotype.
+    out1 : path the full clonotype table is written to.
+    t10n : path the ten most frequent clonotypes are written to.
+    fname : dataset name; the text before the first '_' labels the summary.
+
+    Returns
+    -------
+    DataFrame with one row per summary item (definition, dominant clonotype,
+    its frequency, number of clonotypes, expanding clonotypes, singletons)
+    and the columns ``[<label>, '%']``.
+
+    Raises
+    ------
+    KeyError if ``clono`` is not a known definition or a required column is
+    missing from the input.
+
+    An input without data rows yields empty tables and a summary with blank
+    dominant clonotype instead of raising.
+    """
+    clono_comps = clono_def[clono]
+
+    # Read the table in chunks and stitch the pieces back together
+    tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
+    chunks = [chunk for chunk in tp]
+    frame = concat(chunks) if chunks else DataFrame(columns=clono_comps)
+
+    # One row per distinct clonotype with the number of reads supporting it
+    counts = frame.groupby(clono_comps).size()
+    x1 = DataFrame(list(counts.index), columns=clono_comps)
+    x1['Reads'] = counts.values
+    total = int(x1['Reads'].sum())
+    x1['Reads/Total'] = x1['Reads'].map(ft.partial(frm, y=total))
+    x1['Frequency %'] = (100*x1['Reads']/total).map('{:.4f}'.format)
+
+    final = x1.sort_values(by = ['Reads'] , ascending = False)
+
+    # Rank clonotypes starting from 1
+    final.index=range(1,len(final)+1)
+    final.to_csv(out1 , sep = '\t')
+
+    numofclono = len(final)
+    clust = len(final[final['Reads'] > 1])
+    sing = len (final[final['Reads'] == 1])
+    top10 = final[clono_comps + ['Frequency %']].head(10)
+    top10.to_csv(t10n , sep = '\t')
+
+    def share(part):
+        # Percentage of all clonotypes; 0 when there are no clonotypes at all
+        if not numofclono:
+            return '{:.4f}'.format(0)
+        return '{:.4f}'.format(100*part/numofclono)
+
+    if numofclono:
+        # str() guards against components that were parsed as non-strings
+        dominant = ', '.join([str(top10[c].values[0]) for c in clono_comps])
+        dominant_freq = top10['Frequency %'].values[0]
+    else:
+        dominant = ''
+        dominant_freq = ''
+
+    summary = [[clono]]
+    summary.append([dominant])
+    summary.append([dominant_freq])
+    summary.append([numofclono])
+    summary.append([clust, share(clust)])
+    summary.append([sing, share(sing)])
+
+    ind = ['Clonotype Definition', 'Dominant Clonotype', 'Frequency', 'Number of Clonotypes' , 'Expanding Clonotypes','Singletons']
+    spl = fname.split('_')
+    col = [spl[0],'%']
+
+    frsum = DataFrame(summary,index = ind, columns = col)
+
+    return frsum
+
+
+if __name__ == '__main__':
+
+    t_begin = time.time()
+
+    # Positional arguments: input table, clonotype definition, full output,
+    # top-10 output, summary output, input dataset name
+    cli = sys.argv
+    src_path = cli[1]
+    definition = cli[2]
+    all_path = cli[3]
+    top10_path = cli[4]
+    summary_path = cli[5]
+    dataset_name = cli[6]
+
+    # Run the computation; it writes the full and top-10 tables itself
+    summary = clonotypeComputation(src_path, definition, all_path, top10_path, dataset_name)
+
+    # Write the summary table unless it is empty
+    if not summary.empty:
+        summary.to_csv(summary_path, sep = '\t')
+
+    # Report the elapsed wall-clock time
+    t_end = time.time()
+    print('Runtime:' + str(t_end-t_begin))
\ No newline at end of file

diff -r 000000000000 -r 0e37e5b73273 clonotype_computation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/clonotype_computation.xml Fri Mar 30 07:22:29 2018 -0400

@@ -0,0 +1,25 @@
+<tool id="clonoComput" name="Clonotype Diversity &amp; Expression" version="0.9">
+    <description>Compute clonotype diversity and expression from filtered file</description>
+    <requirements>
+      <requirement type="package" version="0.19">pandas</requirement>
+    </requirements>
+    <command interpreter="python">clonotype_computation.py $input $clonotype $clonos_file $top10_file $summary_file ${input.name}</command>
+    <inputs>
+        <param name="clonotype" type="select" label="Clonotype definition">
+            <option value="CDR3">CDR3</option>
+            <option value="VCDR3">V+CDR3</option>
+            <option value="JCDR3">J+CDR3</option>
+            <option value="VJCDR3">V+J+CDR3</option>
+            <option value="VDJCDR3">V+D+J+CDR3</option>
+        </param>
+        <param format="tabular" name="input" type="data" label="Filtered-in File" />
+    </inputs>
+    <outputs>
+        <data name="clonos_file" format="tabular" label="${input.name}_clonotypesAll" />
+        <data name="top10_file" format="tabular" label="${input.name}_clonotypesTop10" />
+        <data name="summary_file" format="tabular" label="${input.name}_clonotypesSummary" />
+    </outputs>
+    <help>
+Coming soon
+  </help>
+</tool>

diff -r 000000000000 -r 0e37e5b73273 data_filtering.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_filtering.py Fri Mar 30 07:22:29 2018 -0400

[

b'@@ -0,0 +1,449 @@\n+# -*- coding: utf-8 -*-\r\n+"""\r\n+Created on Wed Sep 4 18:41:42 2013\r\n+\r\n+@author: chmaramis\r\n+"""\r\n+\r\n+from __future__ import division\r\n+import string as strpy\r\n+import numpy as np\r\n+from pandas import *\r\n+from numpy import nan as NA\r\n+import time\r\n+import sys\r\n+\r\n+\r\n+def filter_condition_AAjunction(x):\r\n+ x= x.strip()\r\n+ if \' \' in x:\r\n+ return x.split(\' \')[0]\r\n+ else:\r\n+ return x\r\n+\r\n+#-----------frame creation---------------------\r\n+def dataFiltering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname):\r\n+ \r\n+ try:\r\n+ path=inp\r\n+ frame = DataFrame()\r\n+ seqlen = []\r\n+ head = []\r\n+ tp = read_csv(path, iterator=True, chunksize=5000,sep=\'\\t\', index_col=0 )\r\n+ frame = concat([chunk for chunk in tp])\r\n+ \r\n+ frcol = list(frame.columns)\r\n+ #print frcol[-1]\r\n+ if \'Unnamed\' in frcol[-1]:\r\n+ del frcol[-1]\r\n+ frame=frame[frcol]\r\n+ \r\n+ frame.index = range(1,len(frame)+1)\r\n+ \r\n+ head.append(\'Total reads of raw data\')\r\n+ seqlen.append(len(frame))\r\n+ \r\n+ #------------drop nulls-------------------- \r\n+ filtered = DataFrame()\r\n+ filtall = DataFrame()\r\n+ summ_df = DataFrame()\r\n+ filtered = frame[isnull(frame[\'AA JUNCTION\']) | isnull(frame[\'V-GENE and allele\'])]\r\n+ \r\n+ filtall = filtall.append(filtered)\r\n+ if len(filtall) > 0:\r\n+ filtall.loc[filtered.index,\'Reason\'] = "NoResults"\r\n+ frame = frame[frame[\'AA JUNCTION\'].notnull()]\r\n+ frame = frame[frame[\'V-GENE and allele\'].notnull()]\r\n+ \r\n+ head.append(\'Not Null CDR3/V\')\r\n+ head.append(\'filter out\')\r\n+ seqlen.append(len(frame))\r\n+ seqlen.append(len(filtered))\r\n+ filtered = DataFrame()\r\n+ \r\n+ if psorf.startswith(\'y\') or psorf.startswith(\'Y\'):\r\n+ \r\n+ cc0=np.array(frame[\'V-GENE and allele\'].unique())\r\n+ \r\n+ \r\n+ for x in cc0:\r\n+ x1=x.split(\'*\')\r\n+ try:\r\n+ if (x1[1].find(\'P\')>-1) or (x1[1].find(\'ORF\')>-1):\r\n+ filtered = 
filtered.append(frame[frame[\'V-GENE and allele\'] == x])\r\n+ frame[\'V-GENE and allele\']=frame[\'V-GENE and allele\'].replace(x,NA)\r\n+ elif x.find(\'or\')>-1:\r\n+ posa=x.count(\'or\') \r\n+ x2=x.split(\'or\')\r\n+ x4=\'\'\r\n+ genelist=[] \r\n+ for cnt in range(0, posa+1):\r\n+ x3=x2[cnt].split(\'*\')\r\n+ x3[0]=x3[0].strip()#kobei ta space\r\n+ k=x3[0].split(\' \')# holds only TRBV\r\n+ if cnt==0:\r\n+ genelist.append(k[1])\r\n+ x4+=k[1]\r\n+ elif ((str(k[1]) in genelist) == False) & (x3[1].find(\'P\')==-1):# check for P in x3\r\n+ genelist.append(k[1])\r\n+ x4+=\' or \' \r\n+ x4+=k[1]\r\n+ x3=None\r\n+ k1=None\r\n+ genelist=None \r\n+ \r\n+ frame[\'V-GENE and allele\']=frame[\'V-GENE and allele\'].replace(x,x4)\r\n+ \r\n+ else:\r\n+ s=x1[0].split(\' \')\r\n+ frame[\'V-GENE and allele\']=frame[\'V-GENE and allele\'].replace(x,s[1])\r\n+ except IndexError as e:\r\n+ print(\'V-gene is already been formed\')\r\n+ continue\r\n+ \r\n+ x=None\r\n+ x1=None\r\n+ s=None\r\n+ \r\n+ filtall = filtall.append(filtered)\r\n+ if len(filtall) > 0:\r\n+ '..b" cc1=np.array(frame['D-GENE and allele'].unique())\r\n+ for x in cc1:\r\n+ try:\r\n+ if notnull(x): \r\n+ x1=x.split('*')\r\n+ trbd=x1[0].split(' ')\r\n+ frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,trbd[1])\r\n+ else:\r\n+ frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,'none')\r\n+ except IndexError as e:\r\n+ print('D-gene has been formed')\r\n+ \r\n+ \r\n+ x=None\r\n+ x1=None \r\n+ \r\n+ \r\n+ if Jgene != 'null':\r\n+ \r\n+ filtered = DataFrame()\r\n+ \r\n+ filtered = frame[frame['J-GENE and allele'] != Jgene]\r\n+ \r\n+ filtall = filtall.append(filtered)\r\n+ if len(filtall) > 0:\r\n+ filtall.loc[filtered.index,'Reason'] = 'J-GENE not {} '.format(Jgene)\r\n+ \r\n+ \r\n+ frame = frame[frame['J-GENE and allele'] == Jgene]\r\n+ \r\n+ \r\n+ \r\n+ head.append('J-GENE = {} '.format(Jgene))\r\n+ head.append('filter out')\r\n+ seqlen.append(len(frame))\r\n+ 
seqlen.append(len(filtered))\r\n+ \r\n+ \r\n+\r\n+ if Dgene != 'null':\r\n+ \r\n+ filtered = DataFrame()\r\n+ \r\n+ filtered = frame[frame['D-GENE and allele'] != Dgene]\r\n+ \r\n+ filtall = filtall.append(filtered)\r\n+ if len(filtall) > 0:\r\n+ filtall.loc[filtered.index,'Reason'] = 'D-GENE not {} '.format(Dgene)\r\n+ \r\n+ \r\n+ frame = frame[frame['D-GENE and allele'] == Dgene]\r\n+ \r\n+ \r\n+ \r\n+ head.append('D-GENE = {} '.format(Dgene))\r\n+ head.append('filter out')\r\n+ seqlen.append(len(frame))\r\n+ seqlen.append(len(filtered))\r\n+ \r\n+ \r\n+ head.append('Total filter out')\r\n+ head.append('Total filter in')\r\n+ seqlen.append(len(filtall))\r\n+ seqlen.append(len(frame))\r\n+ summ_df = DataFrame(index = head)\r\n+ col = fname\r\n+ \r\n+ summ_df[col] = seqlen\r\n+ frame=frame.rename(columns = {'V-GENE and allele':'V-GENE',\r\n+ 'J-GENE and allele':'J-GENE','D-GENE and allele':'D-GENE'})\r\n+ \r\n+ \r\n+ frcol.append('Reason')\r\n+ \r\n+ filtall = filtall[frcol]\r\n+ \r\n+ #--------------out CSV--------------------------- \r\n+ frame.index = range(1,len(frame)+1)\r\n+ if not summ_df.empty:\r\n+ summ_df['%'] = (100*summ_df[summ_df.columns[0]]/summ_df[summ_df.columns[0]][summ_df.index[0]]).map(('{:.4f}'.format))\r\n+ return(frame,filtall,summ_df)\r\n+ except KeyError as e:\r\n+ print('This file has no ' + str(e) + ' column')\r\n+ return(frame,filtall,summ_df)\r\n+\r\n+\r\n+if __name__ == '__main__': \r\n+\r\n+ start=time.time() \r\n+ \r\n+ # Parse input arguments\r\n+ inp = sys.argv[1]\r\n+ cells = sys.argv[2]\r\n+ psorf = sys.argv[3]\r\n+ con = sys.argv[4]\r\n+ prod = sys.argv[5]\r\n+ CF = sys.argv[6]\r\n+ Vper = float(sys.argv[7])\r\n+ Vgene = sys.argv[8]\r\n+ laa1 = sys.argv[9]\r\n+ conaa = sys.argv[10]\r\n+ filterin = sys.argv[11]\r\n+ filterout = sys.argv[12]\r\n+ Sum_table = sys.argv[13]\r\n+ Jgene = sys.argv[14]\r\n+ Dgene = sys.argv[15]\r\n+ laa2 = sys.argv[16]\r\n+ fname = sys.argv[17]\r\n+ \r\n+ # Execute basic function\r\n+ fin,fout,summ = 
dataFiltering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname)\r\n+ \r\n+ # Save output to CSV files\r\n+ if not summ.empty:\r\n+ summ.to_csv(Sum_table, sep = '\\t')\r\n+ if not fin.empty:\r\n+ fin.to_csv(filterin , sep = '\\t')\r\n+ if not fout.empty: \r\n+ fout.to_csv(filterout, sep= '\\t')\r\n+ \r\n+ # Print execution time\r\n+ stop=time.time()\r\n+ print('Runtime:' + str(stop-start))\r\n+\r\n"

diff -r 000000000000 -r 0e37e5b73273 data_filtering.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_filtering.xml Fri Mar 30 07:22:29 2018 -0400

@@ -0,0 +1,103 @@
+<tool id="dataFilter" name="Data Filtering" version="0.9">
+    <description>Filter IMGT Summary dataset</description>
+    <requirements>
+      <requirement type="package" version="0.19">pandas</requirement>
+    </requirements>
+    <command interpreter="python">data_filtering.py $input $TCR_or_BCR $Vfun $spChar $prod $delCF $threshold $Vg.Vgid $clen.cdr3len1 $cdp.cdr3part $filterin_file $filterout_file $summary_file $Jg.Jgid $Dg.Dgid $clen.cdr3len2 $process_id
+  </command>
+    <inputs>
+        <param format="txt" name="input" type="data" label="IMGT Summary Output" />
+        <param format="txt" name="process_id" type="text" label="Process ID" />
+        <param name="TCR_or_BCR" type="select" label="T-cell or B-cell option">
+            <option value="TCR">T-cell</option>
+            <option value="BCR">B-cell</option>
+        </param>
+        <param name="Vfun" type="select" label="Only Take Into Account Functional V-GENE? ">
+            <option value="y">yes</option>
+            <option value="n">no</option>
+        </param>
+        <param name="spChar" type="select" label="Only Take Into Account CDR3 with no Special Characters (X,*,#)? ">
+            <option value="y">yes</option>
+            <option value="n">no</option>
+        </param>
+        <param name="prod" type="select" label="Only Take Into Account Productive Sequences? ">
+            <option value="y">yes</option>
+            <option value="n">no</option>
+        </param>
+        <param name="delCF" type="select" label="Only Take Into Account CDR3 with valid start/end landmarks? ">
+            <option value="y">yes</option>
+            <option value="n">no</option>
+        </param>
+        <param name="threshold" type="float" size="3" value="0" min="0" max="100" label="V-REGION identity %" />
+        <conditional name="Vg">
+            <param name="Vg_select" type="select" label="Select Specific V gene?">
+                <option value="y">Yes</option>
+                <option value="n" selected="true">No</option>
+            </param>
+            <when value="y">
+                <param format="txt" name="Vgid" type="text" label="Type V gene" />
+            </when>
+            <when value="n">
+                <param name="Vgid" type="hidden" value="null" />
+            </when>
+        </conditional>
+        <conditional name="Jg">
+            <param name="Jg_select" type="select" label="Select Specific J gene?">
+                <option value="y">Yes</option>
+                <option value="n" selected="true">No</option>
+            </param>
+            <when value="y">
+                <param format="txt" name="Jgid" type="text" label="Type J gene" />
+            </when>
+            <when value="n">
+                <param name="Jgid" type="hidden" value="null" />
+            </when>
+        </conditional>
+        <conditional name="Dg">
+            <param name="Dg_select" type="select" label="Select Specific D gene?">
+                <option value="y">Yes</option>
+                <option value="n" selected="true">No</option>
+            </param>
+            <when value="y">
+                <param format="txt" name="Dgid" type="text" label="Type D gene" />
+            </when>
+            <when value="n">
+                <param name="Dgid" type="hidden" value="null" />
+            </when>
+        </conditional>
+        <conditional name="clen">
+            <param name="clen_select" type="select" label="Select CDR3 length range?">
+                <option value="y">Yes</option>
+                <option value="n" selected="true">No</option>
+            </param>
+            <when value="y">
+                <param name="cdr3len1" type="integer" size="3" value="0" min="0" max="100" label="CDR3 Length Lower Threshold" />
+                <param name="cdr3len2" type="integer" size="3" value="0" min="0" max="100" label="CDR3 Length Upper Threshold" />
+            </when>
+            <when value="n">
+                <param name="cdr3len1" type="hidden" value="null" />
+                <param name="cdr3len2" type="hidden" value="null" />
+            </when>
+        </conditional>
+        <conditional name="cdp">
+            <param name="cdp_select" type="select" label="Only select CDR3 containing specific amino-acid sequence?">
+                <option value="y">Yes</option>
+                <option value="n" selected="true">No</option>
+            </param>
+            <when value="y">
+                <param format="txt" name="cdr3part" type="text" label="Type specific amino-acid sequence" />
+            </when>
+            <when value="n">
+                <param name="cdr3part" type="hidden" value="null" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="filterin_file" format="tabular" label="${process_id}_filterin" />
+        <data name="filterout_file" format="tabular" label="${process_id}_filterout" />
+        <data name="summary_file" format="tabular" label="${process_id}_filterSummary" />
+    </outputs>
+    <help>
+This tool filters an IMGT Summary dataset based on a combination of criteria.
+  </help>
+</tool>

diff -r 000000000000 -r 0e37e5b73273 exclusive_clonotype_computation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/exclusive_clonotype_computation.py Fri Mar 30 07:22:29 2018 -0400

[

@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar 24 17:31:38 2018
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+# Columns that jointly identify a clonotype under each supported definition
+clono_def = {'CDR3': ['AA JUNCTION'],
+             'VCDR3': ['V-GENE','AA JUNCTION'],
+             'JCDR3': ['J-GENE','AA JUNCTION']}
+
+
+def exclusiveClonotypeComputation(inputs, clono, thres):
+    """Return the clonotypes of file A that do not occur in file B.
+
+    Parameters
+    ----------
+    inputs : sequence in which element 0 is the path of clonotype file A and
+        element 2 the path of clonotype file B (elements 1 and 3 are unused
+        here). Both files are tab-separated with a 'Reads' column.
+    clono : key of ``clono_def`` selecting the columns compared between files.
+    thres : the string 'null' for no filtering, otherwise an integer-like
+        string; only rows with ``Reads`` strictly greater than it are kept
+        in both files before comparing.
+
+    Returns
+    -------
+    DataFrame holding the rows of A without a match in B, indexed from 1.
+    """
+    clono_comps = clono_def[clono]
+
+    def load(path):
+        # Read one clonotype table and apply the optional read threshold
+        table = read_csv(path , sep = '\t' , index_col = 0)
+        if (thres != 'null'):
+            table = table[table['Reads'] > int(thres)]
+        return table
+
+    # File A
+    vClono = load(inputs[0])
+
+    # File B: only its key columns and read counts are needed for the lookup
+    cl = load(inputs[2]).rename(columns={'Reads':'ReadsB'})
+    vClono = vClono.merge(cl[clono_comps+['ReadsB']], how='left', on=clono_comps)
+
+    # Assign the filled column back rather than calling fillna(inplace=True)
+    # on a column selection, which is a chained update that pandas
+    # Copy-on-Write no longer propagates to the frame.
+    vClono['ReadsB'] = vClono['ReadsB'].fillna(0)
+
+    # Rows of A with no reads in B are exclusive to A
+    vClono = vClono[vClono['ReadsB'] == 0].drop('ReadsB', axis=1)
+
+    vClono.index = range(1,len(vClono)+1)
+
+    return vClono
+
+
+if __name__ == '__main__':
+
+    t_begin = time.time()
+
+    # Positional arguments: clonotype definition, output path, read
+    # threshold, then the input entries (path and name of A, then of B)
+    cli = sys.argv
+    definition = cli[1]
+    out_path = cli[2]
+    min_reads = cli[3]
+    file_args = cli[4:]
+
+    # Compute the clonotypes found only in the first file
+    exclusive = exclusiveClonotypeComputation(file_args, definition, min_reads)
+
+    # Write the result unless it is empty
+    if not exclusive.empty:
+        exclusive.to_csv(out_path , sep = '\t')
+
+    # Report the elapsed wall-clock time
+    t_end = time.time()
+    print('Runtime:' + str(t_end-t_begin))

diff -r 000000000000 -r 0e37e5b73273 exclusive_clonotype_computation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/exclusive_clonotype_computation.xml Fri Mar 30 07:22:29 2018 -0400

@@ -0,0 +1,40 @@
+<tool id="exclClonoComput" name="Exclusive Clonotypes" version="0.9">
+<description>Compute exclusive clonotypes between 2 repertoires</description>
+<requirements>
+      <requirement type="package" version="0.19">pandas</requirement>
+</requirements>
+<!-- "$clonotype" must match the name of the select param declared in <inputs> -->
+<command interpreter="python">exclusive_clonotype_computation.py "$clonotype" "$output_file" "$Th.thres" "$inputA" "$inputA.name" "$inputB" "$inputB.name"
+</command>
+<inputs>
+        <param name="clonotype" type="select" label="Clonotype definition">
+            <option value="CDR3">CDR3</option>
+            <option value="VCDR3">V+CDR3</option>
+            <option value="JCDR3">J+CDR3</option>
+        </param>
+ <conditional name="Th">
+
+ <param name="thres_select" type="select" label="Remove CDR3 With Reads Fewer Than Threshold?">
+ <option value="y">Yes</option>
+ <option value="n" selected="true">No</option>
+ </param>
+
+ <when value="y">
+ <param name="thres" type="integer" size="4" value="1" min="1"  label="Keep CDR3 with Number of Reads more than"/>
+ </when>
+
+ <when value="n">
+ <param name="thres" type="hidden" value="null" />
+ </when>
+
+ </conditional>
+ <param format="txt" name="inputA" type="data" label="First Clonotypes File (A)"/>
+ <param format="txt" name="inputB" type="data" label="Second Clonotypes File (B)"/>
+</inputs>
+
+<outputs>
+<data format="tabular" name="output_file" label="exclusiveClonotypes"/>
+</outputs>
+<help>
+Coming soon
+</help>
+</tool>

diff -r 000000000000 -r 0e37e5b73273 gene_comparison.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_comparison.py Fri Mar 30 07:22:29 2018 -0400

[

@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar 24 17:45:09 2018
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+# Column-name predicates for the gene usage tables
+sw_clonos = lambda x: x.startswith('Clonotypes')
+sw_freq = lambda x: x.startswith('Freq')
+sw_gene = lambda x: x.endswith('GENE')
+
+def geneComparison(inputs):
+    """Merge the gene usage frequencies of several repertoires.
+
+    Parameters
+    ----------
+    inputs : flat sequence of (file path, dataset name) pairs. Each file is
+        a tab-separated table with one column whose name ends in 'GENE', one
+        whose name starts with 'Freq', and optionally columns whose names
+        start with 'Clonotypes' (these are dropped).
+
+    Returns
+    -------
+    DataFrame with the gene column, one frequency column per input named
+    after the text before the first '_' of its dataset name, and a 'mean'
+    column formatted to four decimals; rows are sorted by descending mean
+    and indexed from 1. Genes missing from an input count as 0.
+
+    Raises
+    ------
+    IndexError if an input lacks a frequency or gene column.
+    """
+    mer=DataFrame()
+
+    for x in range(0,len(inputs),2):
+
+            ini = read_csv(inputs[x] , sep = '\t' , index_col = 0)
+
+            # Only the gene and its frequency are compared across inputs
+            ini = ini.drop([c for c in ini.columns if sw_clonos(c)], axis=1)
+
+            x1 = inputs[x+1].split('_')
+            freq_col = [c for c in ini.columns if sw_freq(c)][0]
+            ini = ini.rename(columns={freq_col: x1[0]})
+
+            if mer.empty:
+                mer = DataFrame(ini)
+            else:
+                gene_col = [c for c in ini.columns if sw_gene(c)][0]
+                mer = merge(mer,ini, on=gene_col , how='outer')
+
+    mer=mer.fillna(0)
+    # numeric_only keeps the string gene column out of the row sums; without
+    # it pandas >= 2.0 raises TypeError when adding str and float.
+    mer['mean'] = mer.sum(axis=1, numeric_only=True)/(len(mer.columns)-1)
+    fr = 'mean'
+
+    mer=mer.sort_values(by = fr,ascending=False)
+    mer[fr] = mer[fr].map('{:.4f}'.format)
+    mer.index = range(1,len(mer)+1)
+
+    return mer
+
+
+if __name__ == '__main__':
+
+    t_begin = time.time()
+
+    # Positional arguments: output path first, then the input entries
+    # (file path followed by dataset name, repeated per repertoire)
+    cli = sys.argv
+    out_path = cli[1]
+    file_args = cli[2:]
+
+    # Merge the gene usage tables
+    merged = geneComparison(file_args)
+
+    # Write the result unless it is empty
+    if not merged.empty:
+        merged.to_csv(out_path , sep = '\t')
+
+    # Report the elapsed wall-clock time
+    t_end = time.time()
+    print('Runtime:' + str(t_end-t_begin))

diff -r 000000000000 -r 0e37e5b73273 gene_comparison.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_comparison.xml Fri Mar 30 07:22:29 2018 -0400

@@ -0,0 +1,23 @@
+<tool id="geneCompar" name="Gene Usage Comparison" version="0.9">
+<description>Compare gene usages from multiple repertoires</description>
+<requirements>
+ <requirement type="package" version="0.19">pandas</requirement>
+</requirements>
+<command interpreter="python">gene_comparison.py "$output_file"
+#for x in $rep_files
+ "$x.rpfile"
+ "$x.rpfile.name"
+#end for
+</command>
+<inputs>
+<repeat name="rep_files" title="Patient" min="2">
+<param name="rpfile" type="data" label="File of gene usage repertoire" format="tabular"/>
+</repeat>
+</inputs>
+<outputs>
+<data format="tabular" name="output_file" label="geneUsageComparison"/>
+</outputs>
+<help>
+Coming soon
+</help>
+</tool>

diff -r 000000000000 -r 0e37e5b73273 gene_computation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_computation.py Fri Mar 30 07:22:29 2018 -0400

[

@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 20 14:58:08 2014
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+import functools as ft
+import sys
+import time
+
+def frm(x, y):
+    """Format the count *x* against the total *y* as the string 'x/y'."""
+    return '{r}/{l}'.format(r=x,l=y)
+
+
+# Input column holding the gene for each selectable gene family
+gene_options = {'V': 'V-GENE',
+             'J': 'J-GENE'}
+
+
+def geneComputation(inp_name, gene, fname):
+    """Count how many rows of a clonotype table use each gene.
+
+    Parameters
+    ----------
+    inp_name : path of the tab-separated clonotype table (first column is
+        the index).
+    gene : key of ``gene_options`` ('V' or 'J') selecting the gene column.
+    fname : dataset name; the text before the first '_' labels the summary.
+
+    Returns
+    -------
+    (rep, su, summdf) where ``rep`` is the usage table sorted by descending
+    count and indexed from 1, ``su`` holds the gene and 'Frequency %' of its
+    first ten rows, and ``summdf`` is a summary naming the gene column, the
+    dominant gene and its frequency.
+
+    Raises
+    ------
+    KeyError if ``gene`` is unknown or the gene column is missing.
+
+    An input without data rows yields empty tables and a summary with blank
+    dominant gene and frequency instead of raising.
+    """
+    gene_full = gene_options[gene]
+
+    df = read_csv(inp_name, sep='\t', index_col=0 )
+    # Frequencies are relative to all rows of the input table
+    n_rows = len(df)
+
+    vdi = df.groupby(gene_full).size()
+    rep = DataFrame(list(vdi.index), columns=[gene_full])
+    rep['Clonotypes'] = vdi.values
+    rep['Clonotypes/Total'] = rep['Clonotypes'].map(ft.partial(frm, y=n_rows))
+    rep['Frequency %'] = (100*rep['Clonotypes']/n_rows).map('{:.4f}'.format)
+
+    rep = rep.sort_values(by = ['Clonotypes'] , ascending = False)
+    rep.index = range(1,len(rep)+1)
+
+    su = rep[[gene_full, 'Frequency %']].head(10)
+
+    if len(su):
+        dominant = su[gene_full].values[0]
+        dominant_freq = su['Frequency %'].values[0]
+    else:
+        # Nothing to report for an input without data rows
+        dominant = ''
+        dominant_freq = ''
+
+    spl = fname.split('_')
+    summdf = DataFrame([gene_full, dominant, dominant_freq],
+                       index = ['Gene Family','Dominant Gene','Frequency'], columns = [spl[0]])
+    summdf['%'] = ''
+
+    return (rep, su, summdf)
+
+
+if __name__ == '__main__':
+
+    t_begin = time.time()
+
+    # Positional arguments: input table, gene family, usage output,
+    # top-10 output, summary output, input dataset name
+    cli = sys.argv
+    src_path = cli[1]
+    family = cli[2]
+    usage_path = cli[3]
+    top10_path = cli[4]
+    summary_path = cli[5]
+    dataset_name = cli[6]
+
+    # Compute the usage table, its top rows and the summary
+    usage, top_rows, summary = geneComputation(src_path, family, dataset_name)
+
+    # Write each table unless it is empty
+    if not usage.empty:
+        usage.to_csv(usage_path, sep = '\t')
+    if not top_rows.empty:
+        top_rows.to_csv(top10_path, sep = '\t')
+    if not summary.empty:
+        summary.to_csv(summary_path, sep = '\t')
+
+    # Report the elapsed wall-clock time
+    t_end = time.time()
+    print('Runtime:' + str(t_end-t_begin))

diff -r 000000000000 -r 0e37e5b73273 gene_computation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_computation.xml Fri Mar 30 07:22:29 2018 -0400

@@ -0,0 +1,22 @@
+<tool id="geneComput" name="Gene Usage" version="0.9">
+    <description>Compute gene usage from clonotype file</description>
+    <requirements>
+      <requirement type="package" version="0.19">pandas</requirement>
+    </requirements>
+    <command interpreter="python">gene_computation.py $input $gene $usage_file $top10_file $summary_file ${input.name}</command>
+    <inputs>
+        <param name="gene" type="select" label="Gene family">
+            <option value="V">V-Gene</option>
+            <option value="J">J-Gene</option>
+        </param>
+        <param format="tabular" name="input" type="data" label="Clonotype file" />
+    </inputs>
+    <outputs>
+        <data name="usage_file" format="tabular" label="${input.name}_geneUsageAll" />
+        <data name="top10_file" format="tabular" label="${input.name}_geneUsageTop10" />
+        <data name="summary_file" format="tabular" label="${input.name}_geneUsageSummary" />
+    </outputs>
+    <help>
+Coming soon
+  </help>
+</tool>

diff -r 000000000000 -r 0e37e5b73273 public_clonotype_computation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/public_clonotype_computation.py Fri Mar 30 07:22:29 2018 -0400

[

@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar 24 17:18:09 2018
+
+@author: chmaramis
+"""
+
+from __future__ import division
+import numpy as np
+from pandas import *
+from numpy import nan as NA
+import sys
+import time
+
+# Columns that jointly identify a clonotype under each supported definition
+clono_def = {'CDR3': ['AA JUNCTION'],
+             'VCDR3': ['V-GENE','AA JUNCTION'],
+             'JCDR3': ['J-GENE','AA JUNCTION']}
+
+
+
+def publicClonotypeComputation(inputs, clono, thres):
+    """Find the clonotypes shared by more than one repertoire.
+
+    Parameters
+    ----------
+    inputs : flat sequence of (file path, dataset name) pairs. Each file is
+        a tab-separated clonotype table holding the clonotype columns plus
+        'Reads', 'Reads/Total' and 'Frequency %'.
+    clono : key of ``clono_def`` selecting the columns the files are merged on.
+    thres : the string 'null' for no filtering, otherwise an integer-like
+        string; only rows with ``Reads`` strictly greater than it are kept.
+
+    Returns
+    -------
+    DataFrame with the clonotype columns, a '<label> Reads/Total' and a
+    '<label> Relative Frequency %' column per input (the label is the text
+    before the first '_' of the dataset name) and 'Num of Patients'. Only
+    clonotypes present in more than one input are kept, indexed from 1.
+
+    Raises
+    ------
+    KeyError if ``clono`` is unknown or an input lacks a required column.
+    """
+    clono_comps = clono_def[clono]
+
+    merged = DataFrame()
+
+    for x in range(0,len(inputs),2):
+            cl = read_csv(inputs[x] , sep = '\t' , index_col = 0)
+
+            if (thres != 'null'):
+                cl = cl[cl['Reads'] > int(thres)]
+
+            x1 = inputs[x+1].split('_')
+
+            # Select and rename by column name: positional renaming assumed
+            # exactly two clonotype columns and failed for the 'CDR3'
+            # definition, which has only one.
+            cl = cl[clono_comps + ['Reads/Total', 'Frequency %']]
+            cl = cl.rename(columns={'Reads/Total': x1[0]+' Reads/Total',
+                                    'Frequency %': x1[0]+' Relative Frequency %'})
+
+            if merged.empty:
+                merged = cl
+            else:
+                merged = merged.merge(cl, how='outer', on=clono_comps)
+
+
+    freq_cols = [c for c in merged.columns if 'Frequency' in c]
+    read_cols = [c for c in merged.columns if 'Reads/Total' in c]
+
+    # A clonotype missing from an input has frequency 0 and unknown total
+    merged[freq_cols] = merged[freq_cols].fillna(0)
+    merged[read_cols] = merged[read_cols].fillna('0/*')
+
+    merged['Num of Patients'] = (merged[freq_cols] != 0).sum(axis=1)
+
+    merged = merged[merged['Num of Patients'] > 1]
+
+    merged.index = range(1,len(merged)+1)
+
+    return merged
+
+
+if __name__ == '__main__':
+
+    t_begin = time.time()
+
+    # Positional arguments: clonotype definition, output path, read
+    # threshold, then the input entries (file path followed by dataset
+    # name, repeated per repertoire)
+    cli = sys.argv
+    definition = cli[1]
+    out_path = cli[2]
+    min_reads = cli[3]
+    file_args = cli[4:]
+
+    # Compute the clonotypes shared between repertoires
+    shared = publicClonotypeComputation(file_args, definition, min_reads)
+
+    # Write the result unless it is empty
+    if not shared.empty:
+        shared.to_csv(out_path , sep = '\t')
+
+    # Report the elapsed wall-clock time
+    t_end = time.time()
+    print('Runtime:' + str(t_end-t_begin))

diff -r 000000000000 -r 0e37e5b73273 public_clonotype_computation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/public_clonotype_computation.xml Fri Mar 30 07:22:29 2018 -0400

@@ -0,0 +1,40 @@
+<tool id="pubClonoComput" name="Public Clonotypes" version="0.9">
+    <description>Compute public clonotypes from multiple repertoires</description>
+    <requirements>
+      <requirement type="package" version="0.19">pandas</requirement>
+    </requirements>
+    <command interpreter="python">public_clonotype_computation.py "$clonotype" "$output_file" "$Th.thres"
+#for x in $clono_files
+  "$x.clfile"
+  "$x.clfile.name"
+#end for
+</command>
+    <inputs>
+        <param name="clonotype" type="select" label="Clonotype definition">
+            <option value="CDR3">CDR3</option>
+            <option value="VCDR3">V+CDR3</option>
+            <option value="JCDR3">J+CDR3</option>
+        </param>
+        <conditional name="Th">
+            <param name="thres_select" type="select" label="Remove Clonotypes With Reads Fewer Than Threshold?">
+                <option value="y">Yes</option>
+                <option value="n" selected="true">No</option>
+            </param>
+            <when value="y">
+                <param name="thres" type="integer" size="4" value="1" min="1" label="Keep Clonotypes with Number of Reads more than" />
+            </when>
+            <when value="n">
+                <param name="thres" type="hidden" value="null" />
+            </when>
+        </conditional>
+        <repeat name="clono_files" title="Files to be appended" min="2">
+            <param name="clfile" type="data" label="Clonotype_File" format="tabular" />
+        </repeat>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output_file" label="publicClonotypes" />
+    </outputs>
+    <help>
+Coming soon
+</help>
+</tool>