view cmpb2016/top10_CDR3_inexact_pairing.py @ 0:8be019b173e6 draft

Uploaded included tools
author chmaramis
date Sun, 18 Mar 2018 05:54:20 -0400
parents
children
line wrap: on
line source

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 18 11:37:40 2016

@author: chmaramis
"""

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 18 09:48:00 2016

@author: chmaramis
"""

import pandas as pd
import numpy as np
import sys
import functools as ft

def maxHam1(s1, s2):
    """Return True when s1 and s2 are within Hamming distance 1 of each other.

    Sequences of different lengths never match. Sequences of equal length
    match when they differ in at most one position (identical sequences
    therefore match as well).
    """
    if len(s1) == len(s2):
        mismatches = 0
        for a, b in zip(s1, s2):
            if a != b:
                mismatches += 1
        return mismatches <= 1
    return False
    

if __name__ == "__main__":
    # Command line: <script> <clonotypes.tsv> <output.tsv>
    if len(sys.argv) < 3:
        sys.exit('Usage: %s <clonotypes.tsv> <output.tsv>' % sys.argv[0])
    clonosFN = sys.argv[1]
    outFN = sys.argv[2]

    # Tab-separated clonotype table; its first column becomes the index.
    Cl = pd.read_csv(clonosFN, sep='\t', index_col=0)

    # The first (up to) 10 rows are the reference clonotypes.
    # NOTE(review): presumably the input is sorted by descending frequency,
    # so these are the "top 10" -- confirm against the producer of the file.
    T10 = Cl[:10].copy()
    nTop = len(T10)  # may be < 10 for small inputs

    # The gene column is the first one whose name ends in 'GENE'
    # (case-insensitive).
    geneCols = [x for x in T10.columns if x.upper().endswith('GENE')]
    if not geneCols:
        sys.exit("No column ending in 'GENE' found in %s" % clonosFN)
    geneCol = geneCols[0]

    # Loop-invariant column lookups.
    allJunctions = Cl['AA JUNCTION']
    allGenes = Cl[geneCol]
    allFreqs = Cl['Frequency %']

    # Row 0: matches with the same gene; row 1: matches with another gene.
    # For reference clonotype i, column 2*i holds the number of matching
    # clonotypes and column 2*i+1 the sum of their 'Frequency %'.
    F = np.zeros((2, 2 * nTop))

    for i in range(nTop):
        # Positional access: row i of T10 is row i of Cl, whatever labels
        # the index column happens to contain.
        taa = T10['AA JUNCTION'].iloc[i]
        gene = T10[geneCol].iloc[i]

        # S1: junction of equal length differing in at most one position.
        S1 = allJunctions.apply(ft.partial(maxHam1, s2=taa))
        # S2: same gene as the reference clonotype.
        S2 = allGenes == gene
        # A clonotype always matches itself; exclude the reference row.
        S1.iloc[i] = False

        sameGene = S1 & S2
        otherGene = S1 & ~S2
        F[0, 2*i] = sameGene.sum()
        F[0, 2*i+1] = allFreqs[sameGene].sum()
        F[1, 2*i] = otherGene.sum()
        F[1, 2*i+1] = allFreqs[otherGene].sum()

    # Two output columns per reference clonotype, interleaved:
    # '<junction> Nr. Clonos', '<junction> Freq. %'.
    columns = []
    for aa in T10['AA JUNCTION']:
        columns.append(aa + ' Nr. Clonos')
        columns.append(aa + ' Freq. %')

    D = pd.DataFrame(F, columns=columns, index=['same gene', 'different gene'])
    D.to_csv(outFN, sep='\t')