view scripts/S01b_extract_variable_prot.py @ 0:acc3674e515b draft default tip

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:28:50 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python
#coding: utf-8
#Author : Eric Fontanillas (2010) - Victor Mataigne (2018)

# TODO : 
    # - Deal with missing data : do not do the sign test if missing species in a group
    # - Find a way to avoid the list_species argument

import argparse, os
from functions import dico, write_output, fill_with_NaN

def aa_properties(amino_acids_properties_file):
    """ Read the file 'amino_acids_properties' and stores its content

    Args :
        amino_acids_properties_file (String) : the file

    Return :
        aa_properties (dict) : key/amino-acid - value/list of properties
    """

    dict_aa_properties={}
    with open(amino_acids_properties_file, 'r') as f:
        f.readline() #jump headers
        for line in f.readlines():
            S1 = line.split(",")
            aa_name = S1[1]
            S2 = aa_name.split("/")
            aa_code = S2[1][:-1]

            frequencies = S1[2][:-1]
            residue_weight = S1[5]
            residue_volume = S1[6]
            partial_specific_volume = S1[7]
            hydration = S1[8]

            dict_aa_properties[aa_code] = [frequencies, residue_weight, residue_volume, partial_specific_volume, hydration]

    return(dict_aa_properties)

""" Functions for proteic format """

def all_aa_counts(seq):
    """ Count the occurrences of all amino-acids in a sequence

    Args:
        seq (String) : a proteic sequence

    Returns: a dictionary with amino-acids counts
    """

    aa_counts = {}
    seqU = seq.upper()
    LAA =['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G']

    for aa in LAA:
        aa_counts[aa] = seqU.count(aa)

    return aa_counts

def all_aa_props(seq_counts):
    """ Converts a dictionnary of counts into a dictionnary of proportions

    Args:
        seq_counts (dict) : dictionnary computed by the function all_aa_counts()

    Returns: a dictionary with counts replaced by proportions
    """

    aa_props = {}
    for key in seq_counts.keys():
        aa_props[key] = float(seq_counts[key]) / sum(seq_counts.values())
    return aa_props

def aa_variables_counts_and_props(aa_counts):
    """ Computes several thermostability indices (summed occurrences of some AAs, and then various ratios)

    Args:
        aa_counts (dict) : dictionnary computed by the function all_aa_counts()

    Returns: 
        aa_variables_counts : a dictionary with indices values
        aa_variables_props : a dictionary with indices proportions (ratios excluded)
    """

    # Hyperthermophile Prokaryotes criterias
    #   IVYWREL : positivelly correlated with otpimal growth
    #   ERK : (i.e. ) => positivelly correlated with optimal growth temperature
    #     ERK/DNQTSHA (or DNQTSH ??)
    #     EK/QH

    # Mutationnal bias hypothesis => AT rich: favor FYMINK // GC rich: favor GARP
    #      The mutational bias model predict a linear relationship between GARP vs FYMINK 
    #      ==> so if outliers to that, it means that the excess of GARP or FYMINK are not explained by the mutationnal bias model but by something else (selection ?)
    

    # Hydophobicity hypothesis [should INCREASE with thermal adaptation]
    #   AL
    #   Only non-aromatic : AVLIM
    #   Only aromatic : FYW

    # Charged hypothesis => positivelly correlated with optimal growth temperature
    #   All charged : RHKDE
    #   Only positive : RHK
    #   Only negative : DE

    # Neutral polar hypothesis  [should DECREASE with thermal adaptation]
    #   STNQ

    # Fontanillas' criteria
    #   PAYRE
    #   MVGDS
    #   PAYRE/MVGDS

    # Jollivet's criteria
    #   AC
    #   MVGDS
    #   AC/MVGDS

    aa_variables_counts = {}
    aa_variables_props = {}    
    len_seq = sum(aa_counts.values()) # length of the sequence
    
    # counts of variables
    aa_variables_counts['AC'] = aa_counts['A'] + aa_counts['C']
    aa_variables_counts['APGC'] = aa_counts['A'] + aa_counts['P'] + aa_counts['G'] + aa_counts['C']
    aa_variables_counts['AVLIM'] = aa_counts['A'] + aa_counts['V'] + aa_counts['L'] + aa_counts['I'] + aa_counts['M']
    aa_variables_counts['AVLIMFYW'] = aa_variables_counts['AVLIM'] + aa_counts['F'] + aa_counts['Y'] + aa_counts['W']
    aa_variables_counts['DE'] = aa_counts['D'] + aa_counts['E']
    aa_variables_counts['DNQTSHA'] = aa_counts['D'] + aa_counts['N'] + aa_counts['Q'] + aa_counts['T'] + aa_counts['S'] + aa_counts['H'] + aa_counts['A']
    aa_variables_counts['EK'] = aa_counts['E'] + aa_counts['K']
    aa_variables_counts['ERK'] = aa_counts['E'] + aa_counts['K'] + aa_counts['K']
    aa_variables_counts['FYMINK'] = aa_counts['F'] + aa_counts['Y'] + aa_counts['M'] + aa_counts['I'] + aa_counts['N'] + aa_counts['K']
    aa_variables_counts['FYW'] = aa_counts['F'] + aa_counts['Y'] + aa_counts['W']
    aa_variables_counts['GARP'] = aa_counts['G'] + aa_counts['A'] + aa_counts['R'] + aa_counts['P']
    aa_variables_counts['IVYWREL'] = aa_counts['I'] + aa_counts['V'] + aa_counts['Y'] + aa_counts['W'] + aa_counts['R'] + aa_counts['E'] + aa_counts['L']
    aa_variables_counts['QH'] = aa_counts['Q'] + aa_counts['H']
    aa_variables_counts['RHK'] = aa_counts['R'] + aa_counts['H'] + aa_counts['K']
    aa_variables_counts['RHKDE'] = aa_counts['R'] + aa_counts['H'] + aa_counts['K'] + aa_counts['D'] + aa_counts['E']
    aa_variables_counts['STNQ'] = aa_counts['S'] + aa_counts['T'] + aa_counts['N'] + aa_counts['Q']
    aa_variables_counts['VLIM'] = aa_counts['V'] + aa_counts['L'] + aa_counts['I'] + aa_counts['M']
    aa_variables_counts['PAYRE'] = aa_counts['P'] + aa_counts['A'] + aa_counts['Y'] + aa_counts['R'] + aa_counts['E']
    aa_variables_counts['MVGDS'] = aa_counts['M'] + aa_counts['V'] + aa_counts['G'] + aa_counts['D'] + aa_counts['S']

    # compute proportions
    for key in aa_variables_counts.keys():
        aa_variables_props[key] = float(aa_variables_counts[key]) / float(len_seq)

    if aa_variables_counts['DNQTSHA'] != 0:
        ratio_ERK_DNQTSHA = float(aa_variables_counts['ERK'])/float(aa_variables_counts['DNQTSHA'])
    else :
        ratio_ERK_DNQTSHA = -1

    if aa_variables_counts['QH'] != 0:
        ratio_EK_QH = float(aa_variables_counts['EK'])/float(aa_variables_counts['QH'])
    else :
        ratio_EK_QH = -1
    
    if aa_variables_counts['FYMINK'] != 0:
        ratio_GARP_FYMINK = float(aa_variables_counts['EK'])/float(aa_variables_counts['FYMINK'])
    else :
        ratio_GARP_FYMINK = -1
    
    if aa_variables_counts['VLIM'] != 0:
        ratio_AC_VLIM = float(aa_variables_counts['AC'])/float(aa_variables_counts['VLIM'])
        ratio_APGC_VLIM =  float(aa_variables_counts['APGC'])/float(aa_variables_counts['VLIM'])
    else :
        ratio_AC_VLIM = -1
        ratio_APGC_VLIM = -1

    if aa_variables_counts['MVGDS'] != 0:
        ratio_PAYRE_MVGDS = float(aa_variables_counts['PAYRE'])/float(aa_variables_counts['MVGDS'])
    else :
        ratio_PAYRE_MVGDS = -1

    if aa_variables_counts['MVGDS'] != 0:
        ratio_AC_MVGDS = float(aa_variables_counts['AC'])/float(aa_variables_counts['MVGDS'])
    else :
        ratio_AC_MVGDS = -1    

    aa_variables_counts['ratio_ERK_DNQTSHA'] = ratio_ERK_DNQTSHA
    aa_variables_counts['ratio_EK_QH'] = ratio_EK_QH
    aa_variables_counts['ratio_GARP_FYMINK'] = ratio_GARP_FYMINK
    aa_variables_counts['ratio_AC_VLIM'] = ratio_AC_VLIM
    aa_variables_counts['ratio_APGC_VLIM'] = ratio_APGC_VLIM
    aa_variables_counts['ratio_PAYRE_MVGDS'] = ratio_PAYRE_MVGDS
    aa_variables_counts['ratio_AC_MVGDS'] = ratio_AC_MVGDS

    return aa_variables_counts, aa_variables_props

def sequence_properties_from_aa_properties(aa_counts, aa_properties):
    """ Computes a sequence properties (based on an external data file)

    Args:
        - aa_counts (dict) : counts of amino-acids in the sequence
        - aa_properties (dict) : key/amino-acid - value/list of properties extract from the external data file

    Returns:
        - seq_props (dict) : values of the sequence properties
    """

    LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation']
    seq_props = {}

    for i in range(1,5):
        seq_props[LS[i-1]] = 0
        for key in aa_counts.keys():            
            seq_props[LS[i-1]] += aa_counts[key] * float(aa_properties[key][i])
            
    return seq_props

""" Main """

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("species_list", help="List of species separated by commas")
    parser.add_argument("aa_properties", help="File with all amino-acids properties")
    args = parser.parse_args()

    LAA = ['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G']
    LV = ['IVYWREL','EK','ERK','DNQTSHA','QH','ratio_ERK_DNQTSHA','ratio_EK_QH','FYMINK','GARP',
          'ratio_GARP_FYMINK','AVLIM','FYW','AVLIMFYW','STNQ','RHK','DE','RHKDE','APGC','AC',
          'VLIM','ratio_AC_VLIM','ratio_APGC_VLIM']
    LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation']

    list_inputs = []

    path_inputs = '01_input_files'
    list_inputs = os.listdir(path_inputs)

    lsp = args.species_list.split(',')
    lsp = sorted(lsp)
    flsp = ''
    for el in lsp:
        flsp += el+','

    path_outputs_1 = '02_tables_per_aa'
    path_outputs_2 = '02_tables_per_aa_variable'
    os.mkdir(path_outputs_1)
    os.mkdir(path_outputs_2)

    # Init empty dicts for results
    dict_for_files_aa_counts = {} # counts
    dict_for_files_aa_props = {} # proportions
    dict_for_files_variables_counts = {}
    dict_for_files_variables_props = {}
    dict_for_files_seq_properties = {}

    aa_properties_file = aa_properties(args.aa_properties) # read the aa_properties file

    # All counts and props
    for file in list_inputs:
        # iterate over input files
        sequences = dico(file, path_inputs)
        
        # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS
        # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only
        # This causes a crash here (in the ratios function). The correction skip the whole file.
        # When CDS_Search is corrected, lines with 'skip' can be removed
        skip = False
        for key in sequences.keys():
            if all(x == '-' for x in sequences[key]):
                skip = True

        if not skip:

            aa_counts_per_seq = {}
            aa_props_per_seq = {}
            aa_variables_counts_per_seq = {}
            aa_variables_props_per_seq = {}
            seq_properties = {}

            for key in sequences.keys():
                # iterate over sequences in the file
                aa_counts_per_seq[key] = all_aa_counts(sequences[key])
                aa_props_per_seq[key] = all_aa_props(aa_counts_per_seq[key])
                aa_variables_counts_per_seq[key], aa_variables_props_per_seq[key] = aa_variables_counts_and_props(aa_counts_per_seq[key])
                seq_properties[key] = sequence_properties_from_aa_properties(aa_counts_per_seq[key], aa_properties_file)

            # Add NaN for missing species
            for key in set(lsp).difference(set(sequences.keys())):
                aa_counts_per_seq[key] = fill_with_NaN(LAA)
                aa_props_per_seq[key] = fill_with_NaN(LAA)
                aa_variables_counts_per_seq[key] = fill_with_NaN(LV)
                seq_properties[key] = fill_with_NaN(LS)

            # Add computations to final dicts
            dict_for_files_aa_counts[file] = aa_counts_per_seq
            dict_for_files_aa_props[file] = aa_props_per_seq
            dict_for_files_variables_counts[file] = aa_variables_counts_per_seq
            dict_for_files_variables_props[file] = aa_variables_props_per_seq
            dict_for_files_seq_properties[file] = seq_properties

    # Try with pandas ?
    write_output(LAA, flsp, path_outputs_1, dict_for_files_aa_counts) # one file per AA
    write_output(LV, flsp, path_outputs_2, dict_for_files_variables_counts) # one file per aa_variable
    write_output(LS, flsp, path_outputs_2, dict_for_files_seq_properties) #one file per seq properties

    # { file_name1 : { seq1: {'A' : 0, 'C':0, 'E':0, ...}, 
    #                  seq2: {'A' : 0, 'C':0, 'E':0, ...}, ...
    #                },
    # { file_name2 : { seq1: {'A' : 0, 'C':0, 'E':0, ...}, 
    #                  seq2: {'A' : 0, 'C':0, 'E':0, ...},
    #                },
    # ... }

    # { file_name1 : {seq1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...},
    #                 seq2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...}, ...
    #                },
    #   file_name2 : {seq1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...},
    #                 seq2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...},
    #                },
    #   ... }

    # { file_name1 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...},
    #   file_name2 : {'IVYWREL' : 0, 'EK': 0, 'ERK': 0, ...},
    #   ...
    # }

if __name__ == '__main__':
    main()