Mercurial > repos > abims-sbr > mutcount
comparison scripts/S01b_extract_variable_nuc.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:28:50 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc3674e515b |
---|---|
1 #!/usr/bin/env python | |
2 #coding: utf-8 | |
3 | |
4 import argparse, os | |
5 from functions import dico, write_output, fill_with_NaN | |
6 | |
7 """ Functions for nucleic format """ | |
8 | |
9 def all_nuc_counts(seq): | |
10 """ Counts the number of nucleotides in a sequence | |
11 | |
12 Args : | |
13 - seq (String) : a nucleic sequence | |
14 | |
15 Return: | |
16 - nuc_counts (dict) : counts on nucleotides (key/value : nucleotide/count) | |
17 """ | |
18 nuc_counts = {} | |
19 seqU = seq.upper() | |
20 LN =['A','C','T','G'] | |
21 | |
22 for base in LN: | |
23 nuc_counts[base] = seqU.count(base) | |
24 | |
25 return nuc_counts | |
26 | |
27 def ratios(nuc_counts): | |
28 """ Compute GC and purine ratios from the counts of nucleotides in a sequence | |
29 | |
30 Args : | |
31 - nuc_counts (dict) : dictionary of nucleotides counts (output of all_nuc_counts()) | |
32 | |
33 Return : | |
34 - ratios (dict) : dictionary with ratios | |
35 """ | |
36 | |
37 # Search for compositional bias in genome as marker of thermal adaptation | |
38 ratios = {} | |
39 ratios['GC_percent'] = float(nuc_counts['C'] + nuc_counts['G'])/(sum(nuc_counts.values())*100) | |
40 ratios['purine_percent'] = float(nuc_counts['A'] + nuc_counts['G'])/(sum(nuc_counts.values())*100) | |
41 | |
42 ratios['DIFF_GC'] = nuc_counts['G'] - nuc_counts['C'] | |
43 ratios['DIFF_AT'] = nuc_counts['A'] - nuc_counts['T'] | |
44 | |
45 # Per bp | |
46 ratios['PLI_GC'] = float(ratios['DIFF_GC'])/sum(nuc_counts.values()) | |
47 ratios['PLI_AT'] = float(ratios['DIFF_AT'])/sum(nuc_counts.values()) | |
48 | |
49 # Per 1000 bp | |
50 ratios['PLI_GC_1000'] = ratios['PLI_GC']*1000 | |
51 ratios['PLI_AT_1000'] = ratios['PLI_AT']*1000 | |
52 | |
53 return ratios | |
54 | |
55 def main(): | |
56 parser = argparse.ArgumentParser() | |
57 parser.add_argument("species_list", help="List of species separated by commas") | |
58 args = parser.parse_args() | |
59 | |
60 LN =['A','C','T','G'] | |
61 Lratios = ['GC_percent', 'purine_percent', 'DIFF_GC', 'DIFF_AT', 'PLI_GC', 'PLI_AT', 'PLI_GC_1000', 'PLI_AT_1000'] | |
62 | |
63 list_inputs = [] | |
64 | |
65 path_inputs = '01_input_files' | |
66 list_inputs = os.listdir(path_inputs) | |
67 | |
68 lsp = args.species_list.split(',') | |
69 lsp = sorted(lsp) | |
70 flsp = '' | |
71 for el in lsp: | |
72 flsp += el+',' | |
73 | |
74 path_outputs_1 = '02_tables_per_nucleotide' | |
75 path_outputs_2 = '02_tables_per_nuc_variable' | |
76 os.mkdir(path_outputs_1) | |
77 os.mkdir(path_outputs_2) | |
78 | |
79 dict_for_files_nuc = {} | |
80 dict_for_file_var = {} | |
81 | |
82 # All counts | |
83 for file in list_inputs: | |
84 # iterate over input files | |
85 sequences = dico(file, path_inputs) | |
86 | |
87 # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS | |
88 # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only | |
89 # This causes a crash here (in the ratios function). The correction skip the whole file. | |
90 # When CDS_Search is corrected, lines with 'skip' can be removed | |
91 skip = False | |
92 for key in sequences.keys(): | |
93 if all(x == '-' for x in sequences[key]): | |
94 skip = True | |
95 | |
96 if not skip: | |
97 | |
98 nuc_counts_per_seq = {} | |
99 nuc_var_per_seq = {} | |
100 | |
101 for key in sequences.keys(): | |
102 # iterate over sequences in the file | |
103 nuc_counts_per_seq[key] = all_nuc_counts(sequences[key]) | |
104 nuc_var_per_seq[key] = ratios(nuc_counts_per_seq[key]) | |
105 | |
106 # Add NaN for missing species | |
107 for key in set(lsp).difference(set(sequences.keys())): | |
108 nuc_counts_per_seq[key] = fill_with_NaN(LN) | |
109 nuc_var_per_seq[key] = fill_with_NaN(Lratios) | |
110 | |
111 # Add computations to final dicts | |
112 dict_for_files_nuc[file] = nuc_counts_per_seq | |
113 dict_for_file_var[file] = nuc_var_per_seq | |
114 | |
115 | |
116 # Try with pandas ? | |
117 write_output(LN, flsp, path_outputs_1, dict_for_files_nuc) # one file per nuc | |
118 write_output(Lratios, flsp, path_outputs_2, dict_for_file_var) # one file per nuc_variable | |
119 | |
120 if __name__ == '__main__': | |
121 main() |