diff scripts/S01b_extract_variable_nuc.py @ 0:acc3674e515b draft default tip

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:28:50 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/S01b_extract_variable_nuc.py	Fri Feb 01 10:28:50 2019 -0500
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+#coding: utf-8
+
+import argparse, os
+from functions import dico, write_output, fill_with_NaN
+
+""" Functions for nucleic format """
+
+def all_nuc_counts(seq):
+    """ Counts the number of nucleotides in a sequence
+
+    Args :
+        - seq (String) : a nucleic sequence
+
+    Return:
+        - nuc_counts (dict) : counts on nucleotides (key/value : nucleotide/count)
+    """
+    nuc_counts = {}
+    seqU = seq.upper()
+    LN =['A','C','T','G']
+
+    for base in LN:
+        nuc_counts[base] = seqU.count(base)
+
+    return nuc_counts
+
+def ratios(nuc_counts):
+    """ Compute GC and purine ratios from the counts of nucleotides in a sequence
+
+    Args :
+        - nuc_counts (dict) : dictionary of nucleotides counts (output of all_nuc_counts())
+
+    Return :
+        - ratios (dict) : dictionary with ratios
+    """
+
+    # Search for compositional bias in genome as marker of thermal adaptation
+    ratios = {}
+    ratios['GC_percent'] = float(nuc_counts['C'] + nuc_counts['G'])/(sum(nuc_counts.values())*100)
+    ratios['purine_percent'] = float(nuc_counts['A'] + nuc_counts['G'])/(sum(nuc_counts.values())*100)
+
+    ratios['DIFF_GC'] = nuc_counts['G'] - nuc_counts['C']
+    ratios['DIFF_AT'] = nuc_counts['A'] - nuc_counts['T']
+
+    # Per bp
+    ratios['PLI_GC'] = float(ratios['DIFF_GC'])/sum(nuc_counts.values())
+    ratios['PLI_AT'] = float(ratios['DIFF_AT'])/sum(nuc_counts.values())
+
+    # Per 1000 bp
+    ratios['PLI_GC_1000'] = ratios['PLI_GC']*1000
+    ratios['PLI_AT_1000'] = ratios['PLI_AT']*1000
+  
+    return ratios
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("species_list", help="List of species separated by commas")
+    args = parser.parse_args()
+
+    LN =['A','C','T','G']
+    Lratios = ['GC_percent', 'purine_percent', 'DIFF_GC', 'DIFF_AT', 'PLI_GC', 'PLI_AT', 'PLI_GC_1000', 'PLI_AT_1000']
+
+    list_inputs = []
+
+    path_inputs = '01_input_files'
+    list_inputs = os.listdir(path_inputs)
+
+    lsp = args.species_list.split(',')
+    lsp = sorted(lsp)
+    flsp = ''
+    for el in lsp:
+        flsp += el+','
+    
+    path_outputs_1 = '02_tables_per_nucleotide'
+    path_outputs_2 = '02_tables_per_nuc_variable'
+    os.mkdir(path_outputs_1)
+    os.mkdir(path_outputs_2)
+
+    dict_for_files_nuc = {}
+    dict_for_file_var = {}
+
+    # All counts
+    for file in list_inputs:
+        # iterate over input files
+        sequences = dico(file, path_inputs)
+        
+        # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS
+        # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only
+        # This causes a crash here (in the ratios function). The correction skip the whole file.
+        # When CDS_Search is corrected, lines with 'skip' can be removed
+        skip = False
+        for key in sequences.keys():
+            if all(x == '-' for x in sequences[key]):
+                skip = True
+
+        if not skip:
+
+            nuc_counts_per_seq = {}
+            nuc_var_per_seq = {}
+
+            for key in sequences.keys():
+                # iterate over sequences in the file
+                nuc_counts_per_seq[key] = all_nuc_counts(sequences[key])
+                nuc_var_per_seq[key] = ratios(nuc_counts_per_seq[key])
+
+            # Add NaN for missing species
+            for key in set(lsp).difference(set(sequences.keys())):
+                nuc_counts_per_seq[key] = fill_with_NaN(LN)
+                nuc_var_per_seq[key] = fill_with_NaN(Lratios)
+
+            # Add computations to final dicts
+            dict_for_files_nuc[file] = nuc_counts_per_seq
+            dict_for_file_var[file] = nuc_var_per_seq
+
+
+    # Try with pandas ?
+    write_output(LN, flsp, path_outputs_1, dict_for_files_nuc) # one file per nuc
+    write_output(Lratios, flsp, path_outputs_2, dict_for_file_var) # one file per nuc_variable
+
+if __name__ == '__main__':
+    main()