# HG changeset patch
# User chmaramis
# Date 1521371216 14400
# Node ID f4edb9437b95a2d75adf46f0dc1fedc198f72fc5
# Parent 2669fa191052c95af9862b12afcc797ea462bd12
Deleted selected files
diff -r 2669fa191052 -r f4edb9437b95 cmpb2016/ext_repertoire_V.py
--- a/cmpb2016/ext_repertoire_V.py Sun Mar 18 07:06:48 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Jun 20 14:58:08 2014
-
-@author: chmaramis
-"""
-
-from __future__ import division
-import numpy as np
-from pandas import *
-import functools as ft
-import sys
-import time
-
-frm = lambda x,y: '{r}/{l}'.format(r=x,l=y)
-
-def repertoireComputation(inp_name, fname):
-
- df = DataFrame()
- df = read_csv(inp_name, sep='\t', index_col=0 )
- #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
- #df = concat([chunk for chunk in tp])
-
-
- vgroup = df.groupby(['V-GENE'])
- vdi = vgroup.size()
- rep = DataFrame(list(vdi.index), columns=['V-GENE'])
- rep['Reads'] = vdi.values
- #rep['Reads/Total'] = ['{r}/{l}'.format(r=p , l = len(df)) for p in vdi.values]
- rep['Reads/Total'] = rep['Reads'].map(ft.partial(frm, y=len(df)))
- rep['Frequency %'] = (100*rep['Reads']/len(df)).map('{:.4f}'.format)
-
- rep = rep.sort_values(by = ['Reads'] , ascending = False)
- rep.index = range(1,len(rep)+1)
-
- su = rep[['V-GENE','Frequency %']].head(10)
- spl = fname.split('_')
- summdf = DataFrame([su['V-GENE'].values[0],su['Frequency %'].values[0]],
- index = ['Dominant V-GENE','Frequency'], columns = [spl[0]])
- summdf['%'] = ''
-
- return (rep, su, summdf)
-
-
-if __name__ == '__main__':
-
- start=time.time()
-
- # Parse input arguments
- inp_name = sys.argv[1]
- outrep = sys.argv[2]
- summ_rep = sys.argv[3]
- summ_rep2 = sys.argv[4]
- fname = sys.argv[5]
-
- # Execute basic function
- rep, su, summdf = repertoireComputation(inp_name, fname)
-
- # Save output to CSV files
- if not rep.empty:
- rep.to_csv(outrep, sep = '\t')
- if not su.empty:
- su.to_csv(summ_rep, sep = '\t')
- if not summdf.empty:
- summdf.to_csv(summ_rep2, sep = '\t')
-
- # Print execution time
- stop=time.time()
- print('Runtime:' + str(stop-start))
diff -r 2669fa191052 -r f4edb9437b95 cmpb2016/ext_repertoire_V.xml
--- a/cmpb2016/ext_repertoire_V.xml Sun Mar 18 07:06:48 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-
- Compute repertoire of V-genes
- ext_repertoire_V.py $input $clonos $topcl $summ ${input.name}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-This tool computes the repertoire of V-genes (i.e. , the number of clonotypes using each V-gene over the total number of clonotypes).
-
-
-
diff -r 2669fa191052 -r f4edb9437b95 cmpb2016/ngs_filtering.py
--- a/cmpb2016/ngs_filtering.py Sun Mar 18 07:06:48 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,452 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Sep 4 18:41:42 2013
-
-@author: chmaramis
-"""
-
-from __future__ import division
-import string as strpy
-import numpy as np
-from pandas import *
-from numpy import nan as NA
-import time
-import sys
-
-
-def filter_condition_AAjunction(x):
- x= x.strip()
- if ' ' in x:
- return x.split(' ')[0]
- else:
- return x
-
-#-----------frame creation---------------------
-def filtering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname):
-
- try:
- path=inp
- frame = DataFrame()
- seqlen = []
- head = []
- tp = read_csv(path, iterator=True, chunksize=5000,sep='\t', index_col=0 )
- frame = concat([chunk for chunk in tp])
-
- frcol = list(frame.columns)
- #print frcol[-1]
- if 'Unnamed' in frcol[-1]:
- del frcol[-1]
- frame=frame[frcol]
-
- frame.index = range(1,len(frame)+1)
-
- head.append('Total reads of raw data')
- seqlen.append(len(frame))
-
- #------------drop nulls--------------------
- filtered = DataFrame()
- filtall = DataFrame()
- summ_df = DataFrame()
- filtered = frame[isnull(frame['AA JUNCTION']) | isnull(frame['V-GENE and allele'])]
-
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = "NoResults"
- frame = frame[frame['AA JUNCTION'].notnull()]
- frame = frame[frame['V-GENE and allele'].notnull()]
-
- head.append('Not Null CDR3/V')
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
- filtered = DataFrame()
-
- if psorf.startswith('y') or psorf.startswith('Y'):
-
- cc0=np.array(frame['V-GENE and allele'].unique())
-
-
- for x in cc0:
- x1=x.split('*')
- try:
- if (x1[1].find('P')>-1) or (x1[1].find('ORF')>-1):
- filtered = filtered.append(frame[frame['V-GENE and allele'] == x])
- frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,NA)
- elif x.find('or')>-1:
- posa=x.count('or')
- x2=x.split('or')
- x4=''
- genelist=[]
- for cnt in range(0, posa+1):
- x3=x2[cnt].split('*')
- x3[0]=x3[0].strip()#kobei ta space
- k=x3[0].split(' ')# holds only TRBV
- if cnt==0:
- genelist.append(k[1])
- x4+=k[1]
- elif ((str(k[1]) in genelist) == False) & (x3[1].find('P')==-1):# check for P in x3
- genelist.append(k[1])
- x4+=' or '
- x4+=k[1]
- x3=None
- k1=None
- genelist=None
-
- frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,x4)
-
- else:
- s=x1[0].split(' ')
- frame['V-GENE and allele']=frame['V-GENE and allele'].replace(x,s[1])
- except IndexError as e:
- print('V-gene is already been formed')
- continue
-
- x=None
- x1=None
- s=None
-
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'P or ORF'
- frame = frame[frame['V-GENE and allele'].notnull()]
-
- head.append('Functional TRBV')
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
- filtered = DataFrame()
-
-
-
- #------------FILTERING for data quality--------------------
- if con.startswith('y') or con.startswith('Y'):
- filtered = frame [frame['AA JUNCTION'].str.contains('X') |
- frame['AA JUNCTION'].str.contains('#') |
- frame['AA JUNCTION'].str.contains('[*]')]
-
-
-
- frame = frame [~frame['AA JUNCTION'].str.contains('X') &
- ~frame['AA JUNCTION'].str.contains('#') &
- ~frame['AA JUNCTION'].str.contains('[*]') ]
-
-
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'X,#,*'
- head.append('Not Containing X,#,*')
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
- filtered = DataFrame()
-
- # Set label of functionality column, taking into account current & past IMGT Summary column label
- functionality_label = 'Functionality'
- if 'V-DOMAIN Functionality' in frame.columns:
- functionality_label = 'V-DOMAIN Functionality'
-
- if prod.startswith('y') or prod.startswith('Y'):
- filtered = frame[~frame[functionality_label].str.startswith('productive')]
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'not productive'
-
-
- frame=frame[frame[functionality_label].str.startswith('productive')]
-
- head.append('Productive')
- head.append('filter out')
- seqlen.append(len(frame))
-
- seqlen.append(len(filtered))
-
-
- frame['AA JUNCTION'] = frame['AA JUNCTION'].map(filter_condition_AAjunction)
-
- if CF.startswith('y') or CF.startswith('Y'):
- if cells == 'TCR':
- filtered = DataFrame()
- filtered = frame[~frame['AA JUNCTION'].str.startswith('C') |
- ~frame['AA JUNCTION'].str.endswith('F')]
-
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'Not C..F'
-
- frame = frame[frame['AA JUNCTION'].str.startswith('C') &
- frame['AA JUNCTION'].str.endswith('F')]
-
- head.append('CDR3 landmarks C-F')
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
- filtered = DataFrame()
- elif cells == 'BCR':
- filtered = DataFrame()
- filtered = frame[~frame['AA JUNCTION'].str.startswith('C') |
- ~frame['AA JUNCTION'].str.endswith('W')]
-
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'Not C..W'
-
- frame = frame[frame['AA JUNCTION'].str.startswith('C') &
- frame['AA JUNCTION'].str.endswith('W')]
-
- head.append('CDR3 landmarks C-W')
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
- filtered = DataFrame()
- else:
- print('TCR or BCR type')
-
-
- filtered = DataFrame()
-
- filtered = frame[frame['V-REGION identity %'] < Vper]
-
-
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'identity < {iden}%'.format(iden = Vper)
-
- frame=frame[frame['V-REGION identity %']>= Vper]
- head.append('Identity >= {iden}%'.format(iden = Vper))
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
-
- head.append('Total filter out A')
- head.append('Total filter in A')
- seqlen.append(len(filtall))
- seqlen.append(len(frame))
-
- ###############################
- if Vgene != 'null':
-
- filtered = DataFrame()
-
- filtered = frame[frame['V-GENE and allele'] != Vgene]
-
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'V-GENE != {} '.format(Vgene)
-
-
- frame = frame[frame['V-GENE and allele'] == Vgene]
-
-
-
- head.append('V-GENE = {} '.format(Vgene))
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
-
-
-
- ###############################
- if (laa1 != 'null') or (laa2 != 'null'):
- if int(laa2) == 0:
- low = int(laa1)
- high = 100
- elif int(laa1) > int(laa2):
- low = int(laa2)
- high = int(laa1)
- else:
- low = int(laa1)
- high = int(laa2)
-
- filtered = DataFrame()
- criteria = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) < low)
- criteria2 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) > high)
- filtered = frame[criteria | criteria2]
-
- filtall = filtall.append(filtered)
- if int(laa2)==0:
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'CDR3 length not bigger than {}'.format(low)
- else:
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'CDR3 length not from {} to {}'.format(low,high)
-
- criteria3 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) >= low)
- criteria4 = frame['AA JUNCTION'].apply(lambda row: (len(row)-2) <= high)
- frame = frame[criteria3 & criteria4]
-
- if int(laa2)==0:
- head.append('CDR3 length bigger than {}'.format(low))
- else:
- head.append('CDR3 length from {} to {} '.format(low,high))
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
-
- ###############################
- if conaa != 'null':
- if conaa.islower():
- conaa = conaa.upper()
- filtered = DataFrame()
-
- filtered = frame[~frame['AA JUNCTION'].str.contains(conaa)]
-
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'CDR3 not containing {}'.format(conaa)
-
- frame = frame[frame['AA JUNCTION'].str.contains(conaa) ]
-
- head.append('CDR3 containing {}'.format(conaa))
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
-
-
-
-
- #####------------keep the small J gene name--------------------
- #frame['J-GENE and allele'] = frame['J-GENE and allele'].map(filter_condition_Jgene)
- cc2=np.array(frame['J-GENE and allele'].unique())
-
- for x in cc2:
- try:
- if notnull(x):
- x1=x.split('*')
- # print(x)
- # print (x1[0])
- trbj=x1[0].split(' ')
- frame['J-GENE and allele']=frame['J-GENE and allele'].replace(x,trbj[1])
- except IndexError as e:
- print('J-Gene has been formed')
-
-
-
- x=None
- x1=None
-
-
- #------------keep the small D gene name--------------------
- cc1=np.array(frame['D-GENE and allele'].unique())
- for x in cc1:
- try:
- if notnull(x):
- x1=x.split('*')
- trbd=x1[0].split(' ')
- frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,trbd[1])
- else:
- frame['D-GENE and allele']=frame['D-GENE and allele'].replace(x,'none')
- except IndexError as e:
- print('D-gene has been formed')
-
-
- x=None
- x1=None
-
-
- if Jgene != 'null':
-
- filtered = DataFrame()
-
- filtered = frame[frame['J-GENE and allele'] != Jgene]
-
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'J-GENE not {} '.format(Jgene)
-
-
- frame = frame[frame['J-GENE and allele'] == Jgene]
-
-
-
- head.append('J-GENE = {} '.format(Jgene))
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
-
-
-
- if Dgene != 'null':
-
- filtered = DataFrame()
-
- filtered = frame[frame['D-GENE and allele'] != Dgene]
-
- filtall = filtall.append(filtered)
- if len(filtall) > 0:
- filtall.loc[filtered.index,'Reason'] = 'D-GENE not {} '.format(Dgene)
-
-
- frame = frame[frame['D-GENE and allele'] == Dgene]
-
-
-
- head.append('D-GENE = {} '.format(Dgene))
- head.append('filter out')
- seqlen.append(len(frame))
- seqlen.append(len(filtered))
-
-
- head.append('Total filter out')
- head.append('Total filter in')
- seqlen.append(len(filtall))
- seqlen.append(len(frame))
- summ_df = DataFrame(index = head)
- col = fname
-
- summ_df[col] = seqlen
- frame=frame.rename(columns = {'V-GENE and allele':'V-GENE',
- 'J-GENE and allele':'J-GENE','D-GENE and allele':'D-GENE'})
-
-
- frcol.append('Reason')
-
- filtall = filtall[frcol]
-
- #--------------out CSV---------------------------
- frame.index = range(1,len(frame)+1)
- if not summ_df.empty:
- summ_df['%'] = (100*summ_df[summ_df.columns[0]]/summ_df[summ_df.columns[0]][summ_df.index[0]]).map(('{:.4f}'.format))
- return(frame,filtall,summ_df)
- except KeyError as e:
- print('This file has no ' + str(e) + ' column')
- return(frame,filtall,summ_df)
-
-
-if __name__ == '__main__':
-
- start=time.time()
-
- # Parse input arguments
- inp = sys.argv[1]
- cells = sys.argv[2]
- psorf = sys.argv[3]
- con = sys.argv[4]
- prod = sys.argv[5]
- CF = sys.argv[6]
- Vper = float(sys.argv[7])
- Vgene = sys.argv[8]
- laa1 = sys.argv[9]
- conaa = sys.argv[10]
- filterin = sys.argv[11]
- filterout = sys.argv[12]
- Sum_table = sys.argv[13]
- Jgene = sys.argv[14]
- Dgene = sys.argv[15]
- laa2 = sys.argv[16]
- fname = sys.argv[17]
-
- # Execute basic function
- fin,fout,summ = filtering(inp,cells,psorf,con,prod,CF,Vper,Vgene,laa1,laa2,conaa,Jgene,Dgene,fname)
-
- # Save output to CSV files
- if not summ.empty:
- summ.to_csv(Sum_table, sep = '\t')
- if not fin.empty:
- fin.to_csv(filterin , sep = '\t')
- if not fout.empty:
- fout.to_csv(filterout, sep= '\t')
-
- # Print execution time
- stop=time.time()
- print('Runtime:' + str(stop-start))
-
diff -r 2669fa191052 -r f4edb9437b95 cmpb2016/ngs_filtering.xml
--- a/cmpb2016/ngs_filtering.xml Sun Mar 18 07:06:48 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,139 +0,0 @@
-
- Filter IMGT Summary Data
- ngs_filtering.py $input $TCR_or_BCR $Vfun $spChar $prod $delCF $threshold $Vg.Vgid $clen.cdr3len1 $cdp.cdr3part $filtin $filtout $summ $Jg.Jgid $Dg.Dgid $clen.cdr3len2 $process_id
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-This tool filters IMGT Summary Data based on a combination of criteria.
-
-
-
diff -r 2669fa191052 -r f4edb9437b95 cmpb2016/pub_clono_CDR3.py
--- a/cmpb2016/pub_clono_CDR3.py Sun Mar 18 07:06:48 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Dec 21 18:26:01 2015
-
-@author: chmaramis
-"""
-
-from __future__ import division
-import numpy as np
-from pandas import *
-from numpy import nan as NA
-import sys
-import time
-
-
-
-def publicCDR3Func(inputs,thres):
-
- cdr3=DataFrame()
-
- for x in range(0,len(inputs),2):
- cl = DataFrame()
- cl = read_csv(inputs[x] , sep = '\t' , index_col = 0)
- #tp = read_csv(inp_name, iterator=True, chunksize=5000,sep='\t', index_col=0 )
- #cl = concat([chunk for chunk in tp])
-
- if (thres != 'null'):
- cl = cl[cl['Reads'] > int(thres)]
-
- x1 = inputs[x+1].split('_')
-
- del cl['Reads']
- cl.columns = [cl.columns[0], x1[0]+' '+cl.columns[1], x1[0]+' Relative '+cl.columns[2]]
-
- if cdr3.empty:
- cdr3 = cl
- else:
- cdr3 = cdr3.merge(cl, how='outer', on='AA JUNCTION')
-
-
- col = cdr3.columns
- freqs = col.map(lambda x: 'Frequency' in x)
- reads = col.map(lambda x: 'Reads/Total' in x)
-
- cdr3[col[freqs]] = cdr3[col[freqs]].fillna(0)
- cdr3[col[reads]] = cdr3[col[reads]].fillna('0/*')
-
- cdr3['Num of Patients']= cdr3[col[freqs]].apply(lambda x: np.sum(x != 0), axis=1)
-
- cdr3 = cdr3[cdr3['Num of Patients'] > 1]
-
- cdr3.index = range(1,len(cdr3)+1)
-
- return cdr3
-
-
-if __name__ == '__main__':
-
- start=time.time()
-
- # Parse input arguments
- threshold = sys.argv[2]
- arg = sys.argv[3:]
- output = sys.argv[1]
-
- # Execute basic function
- mer = publicCDR3Func(arg,threshold)
-
- # Save output to CSV files
- if not mer.empty:
- mer.to_csv(output , sep = '\t')
-
- # Print execution time
- stop=time.time()
- print('Runtime:' + str(stop-start))
diff -r 2669fa191052 -r f4edb9437b95 cmpb2016/pub_clono_CDR3.xml
--- a/cmpb2016/pub_clono_CDR3.xml Sun Mar 18 07:06:48 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-
-Compute Public CDR3 Clonotypes
-
-pub_clono_CDR3.py "$output1" "$Th.thres"
-#for x in $clono_files
- "$x.clfile"
- "$x.clfile.name"
-#end for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-This tool computes the public CDR3 clonotypes among a number of patients along with their frequencies for each patient.
-
-