Mercurial > repos > abims-sbr > mutcount
comparison scripts/S01b_extract_variable_nuc.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
| author | abims-sbr |
|---|---|
| date | Fri, 01 Feb 2019 10:28:50 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc3674e515b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 #coding: utf-8 | |
| 3 | |
| 4 import argparse, os | |
| 5 from functions import dico, write_output, fill_with_NaN | |
| 6 | |
| 7 """ Functions for nucleic format """ | |
| 8 | |
| 9 def all_nuc_counts(seq): | |
| 10 """ Counts the number of nucleotides in a sequence | |
| 11 | |
| 12 Args : | |
| 13 - seq (String) : a nucleic sequence | |
| 14 | |
| 15 Return: | |
| 16 - nuc_counts (dict) : counts on nucleotides (key/value : nucleotide/count) | |
| 17 """ | |
| 18 nuc_counts = {} | |
| 19 seqU = seq.upper() | |
| 20 LN =['A','C','T','G'] | |
| 21 | |
| 22 for base in LN: | |
| 23 nuc_counts[base] = seqU.count(base) | |
| 24 | |
| 25 return nuc_counts | |
| 26 | |
| 27 def ratios(nuc_counts): | |
| 28 """ Compute GC and purine ratios from the counts of nucleotides in a sequence | |
| 29 | |
| 30 Args : | |
| 31 - nuc_counts (dict) : dictionary of nucleotides counts (output of all_nuc_counts()) | |
| 32 | |
| 33 Return : | |
| 34 - ratios (dict) : dictionary with ratios | |
| 35 """ | |
| 36 | |
| 37 # Search for compositional bias in genome as marker of thermal adaptation | |
| 38 ratios = {} | |
| 39 ratios['GC_percent'] = float(nuc_counts['C'] + nuc_counts['G'])/(sum(nuc_counts.values())*100) | |
| 40 ratios['purine_percent'] = float(nuc_counts['A'] + nuc_counts['G'])/(sum(nuc_counts.values())*100) | |
| 41 | |
| 42 ratios['DIFF_GC'] = nuc_counts['G'] - nuc_counts['C'] | |
| 43 ratios['DIFF_AT'] = nuc_counts['A'] - nuc_counts['T'] | |
| 44 | |
| 45 # Per bp | |
| 46 ratios['PLI_GC'] = float(ratios['DIFF_GC'])/sum(nuc_counts.values()) | |
| 47 ratios['PLI_AT'] = float(ratios['DIFF_AT'])/sum(nuc_counts.values()) | |
| 48 | |
| 49 # Per 1000 bp | |
| 50 ratios['PLI_GC_1000'] = ratios['PLI_GC']*1000 | |
| 51 ratios['PLI_AT_1000'] = ratios['PLI_AT']*1000 | |
| 52 | |
| 53 return ratios | |
| 54 | |
| 55 def main(): | |
| 56 parser = argparse.ArgumentParser() | |
| 57 parser.add_argument("species_list", help="List of species separated by commas") | |
| 58 args = parser.parse_args() | |
| 59 | |
| 60 LN =['A','C','T','G'] | |
| 61 Lratios = ['GC_percent', 'purine_percent', 'DIFF_GC', 'DIFF_AT', 'PLI_GC', 'PLI_AT', 'PLI_GC_1000', 'PLI_AT_1000'] | |
| 62 | |
| 63 list_inputs = [] | |
| 64 | |
| 65 path_inputs = '01_input_files' | |
| 66 list_inputs = os.listdir(path_inputs) | |
| 67 | |
| 68 lsp = args.species_list.split(',') | |
| 69 lsp = sorted(lsp) | |
| 70 flsp = '' | |
| 71 for el in lsp: | |
| 72 flsp += el+',' | |
| 73 | |
| 74 path_outputs_1 = '02_tables_per_nucleotide' | |
| 75 path_outputs_2 = '02_tables_per_nuc_variable' | |
| 76 os.mkdir(path_outputs_1) | |
| 77 os.mkdir(path_outputs_2) | |
| 78 | |
| 79 dict_for_files_nuc = {} | |
| 80 dict_for_file_var = {} | |
| 81 | |
| 82 # All counts | |
| 83 for file in list_inputs: | |
| 84 # iterate over input files | |
| 85 sequences = dico(file, path_inputs) | |
| 86 | |
| 87 # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS | |
| 88 # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only | |
| 89 # This causes a crash here (in the ratios function). The correction skip the whole file. | |
| 90 # When CDS_Search is corrected, lines with 'skip' can be removed | |
| 91 skip = False | |
| 92 for key in sequences.keys(): | |
| 93 if all(x == '-' for x in sequences[key]): | |
| 94 skip = True | |
| 95 | |
| 96 if not skip: | |
| 97 | |
| 98 nuc_counts_per_seq = {} | |
| 99 nuc_var_per_seq = {} | |
| 100 | |
| 101 for key in sequences.keys(): | |
| 102 # iterate over sequences in the file | |
| 103 nuc_counts_per_seq[key] = all_nuc_counts(sequences[key]) | |
| 104 nuc_var_per_seq[key] = ratios(nuc_counts_per_seq[key]) | |
| 105 | |
| 106 # Add NaN for missing species | |
| 107 for key in set(lsp).difference(set(sequences.keys())): | |
| 108 nuc_counts_per_seq[key] = fill_with_NaN(LN) | |
| 109 nuc_var_per_seq[key] = fill_with_NaN(Lratios) | |
| 110 | |
| 111 # Add computations to final dicts | |
| 112 dict_for_files_nuc[file] = nuc_counts_per_seq | |
| 113 dict_for_file_var[file] = nuc_var_per_seq | |
| 114 | |
| 115 | |
| 116 # Try with pandas ? | |
| 117 write_output(LN, flsp, path_outputs_1, dict_for_files_nuc) # one file per nuc | |
| 118 write_output(Lratios, flsp, path_outputs_2, dict_for_file_var) # one file per nuc_variable | |
| 119 | |
| 120 if __name__ == '__main__': | |
| 121 main() |
