comparison scripts/S01b_extract_variable_nuc.py @ 0:acc3674e515b draft default tip

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:28:50 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc3674e515b
1 #!/usr/bin/env python
2 #coding: utf-8
3
4 import argparse, os
5 from functions import dico, write_output, fill_with_NaN
6
7 """ Functions for nucleic format """
8
9 def all_nuc_counts(seq):
10 """ Counts the number of nucleotides in a sequence
11
12 Args :
13 - seq (String) : a nucleic sequence
14
15 Return:
16 - nuc_counts (dict) : counts on nucleotides (key/value : nucleotide/count)
17 """
18 nuc_counts = {}
19 seqU = seq.upper()
20 LN =['A','C','T','G']
21
22 for base in LN:
23 nuc_counts[base] = seqU.count(base)
24
25 return nuc_counts
26
27 def ratios(nuc_counts):
28 """ Compute GC and purine ratios from the counts of nucleotides in a sequence
29
30 Args :
31 - nuc_counts (dict) : dictionary of nucleotides counts (output of all_nuc_counts())
32
33 Return :
34 - ratios (dict) : dictionary with ratios
35 """
36
37 # Search for compositional bias in genome as marker of thermal adaptation
38 ratios = {}
39 ratios['GC_percent'] = float(nuc_counts['C'] + nuc_counts['G'])/(sum(nuc_counts.values())*100)
40 ratios['purine_percent'] = float(nuc_counts['A'] + nuc_counts['G'])/(sum(nuc_counts.values())*100)
41
42 ratios['DIFF_GC'] = nuc_counts['G'] - nuc_counts['C']
43 ratios['DIFF_AT'] = nuc_counts['A'] - nuc_counts['T']
44
45 # Per bp
46 ratios['PLI_GC'] = float(ratios['DIFF_GC'])/sum(nuc_counts.values())
47 ratios['PLI_AT'] = float(ratios['DIFF_AT'])/sum(nuc_counts.values())
48
49 # Per 1000 bp
50 ratios['PLI_GC_1000'] = ratios['PLI_GC']*1000
51 ratios['PLI_AT_1000'] = ratios['PLI_AT']*1000
52
53 return ratios
54
55 def main():
56 parser = argparse.ArgumentParser()
57 parser.add_argument("species_list", help="List of species separated by commas")
58 args = parser.parse_args()
59
60 LN =['A','C','T','G']
61 Lratios = ['GC_percent', 'purine_percent', 'DIFF_GC', 'DIFF_AT', 'PLI_GC', 'PLI_AT', 'PLI_GC_1000', 'PLI_AT_1000']
62
63 list_inputs = []
64
65 path_inputs = '01_input_files'
66 list_inputs = os.listdir(path_inputs)
67
68 lsp = args.species_list.split(',')
69 lsp = sorted(lsp)
70 flsp = ''
71 for el in lsp:
72 flsp += el+','
73
74 path_outputs_1 = '02_tables_per_nucleotide'
75 path_outputs_2 = '02_tables_per_nuc_variable'
76 os.mkdir(path_outputs_1)
77 os.mkdir(path_outputs_2)
78
79 dict_for_files_nuc = {}
80 dict_for_file_var = {}
81
82 # All counts
83 for file in list_inputs:
84 # iterate over input files
85 sequences = dico(file, path_inputs)
86
87 # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS
88 # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only
89 # This causes a crash here (in the ratios function). The correction skip the whole file.
90 # When CDS_Search is corrected, lines with 'skip' can be removed
91 skip = False
92 for key in sequences.keys():
93 if all(x == '-' for x in sequences[key]):
94 skip = True
95
96 if not skip:
97
98 nuc_counts_per_seq = {}
99 nuc_var_per_seq = {}
100
101 for key in sequences.keys():
102 # iterate over sequences in the file
103 nuc_counts_per_seq[key] = all_nuc_counts(sequences[key])
104 nuc_var_per_seq[key] = ratios(nuc_counts_per_seq[key])
105
106 # Add NaN for missing species
107 for key in set(lsp).difference(set(sequences.keys())):
108 nuc_counts_per_seq[key] = fill_with_NaN(LN)
109 nuc_var_per_seq[key] = fill_with_NaN(Lratios)
110
111 # Add computations to final dicts
112 dict_for_files_nuc[file] = nuc_counts_per_seq
113 dict_for_file_var[file] = nuc_var_per_seq
114
115
116 # Try with pandas ?
117 write_output(LN, flsp, path_outputs_1, dict_for_files_nuc) # one file per nuc
118 write_output(Lratios, flsp, path_outputs_2, dict_for_file_var) # one file per nuc_variable
119
120 if __name__ == '__main__':
121 main()