Mercurial > repos > abims-sbr > mutcount
diff scripts/S01b_extract_variable_nuc.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:28:50 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/S01b_extract_variable_nuc.py Fri Feb 01 10:28:50 2019 -0500 @@ -0,0 +1,121 @@ +#!/usr/bin/env python +#coding: utf-8 + +import argparse, os +from functions import dico, write_output, fill_with_NaN + +""" Functions for nucleic format """ + +def all_nuc_counts(seq): + """ Counts the number of nucleotides in a sequence + + Args : + - seq (String) : a nucleic sequence + + Return: + - nuc_counts (dict) : counts on nucleotides (key/value : nucleotide/count) + """ + nuc_counts = {} + seqU = seq.upper() + LN =['A','C','T','G'] + + for base in LN: + nuc_counts[base] = seqU.count(base) + + return nuc_counts + +def ratios(nuc_counts): + """ Compute GC and purine ratios from the counts of nucleotides in a sequence + + Args : + - nuc_counts (dict) : dictionary of nucleotides counts (output of all_nuc_counts()) + + Return : + - ratios (dict) : dictionary with ratios + """ + + # Search for compositional bias in genome as marker of thermal adaptation + ratios = {} + ratios['GC_percent'] = float(nuc_counts['C'] + nuc_counts['G'])/(sum(nuc_counts.values())*100) + ratios['purine_percent'] = float(nuc_counts['A'] + nuc_counts['G'])/(sum(nuc_counts.values())*100) + + ratios['DIFF_GC'] = nuc_counts['G'] - nuc_counts['C'] + ratios['DIFF_AT'] = nuc_counts['A'] - nuc_counts['T'] + + # Per bp + ratios['PLI_GC'] = float(ratios['DIFF_GC'])/sum(nuc_counts.values()) + ratios['PLI_AT'] = float(ratios['DIFF_AT'])/sum(nuc_counts.values()) + + # Per 1000 bp + ratios['PLI_GC_1000'] = ratios['PLI_GC']*1000 + ratios['PLI_AT_1000'] = ratios['PLI_AT']*1000 + + return ratios + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("species_list", help="List of species separated by commas") + args = parser.parse_args() + + LN =['A','C','T','G'] + Lratios = ['GC_percent', 'purine_percent', 'DIFF_GC', 'DIFF_AT', 'PLI_GC', 'PLI_AT', 'PLI_GC_1000', 'PLI_AT_1000'] + + list_inputs = [] + + path_inputs = '01_input_files' + list_inputs = os.listdir(path_inputs) + + lsp = args.species_list.split(',') + lsp = sorted(lsp) + flsp = '' + for el in lsp: + flsp += el+',' + + path_outputs_1 = '02_tables_per_nucleotide' + path_outputs_2 = '02_tables_per_nuc_variable' + os.mkdir(path_outputs_1) + os.mkdir(path_outputs_2) + + dict_for_files_nuc = {} + dict_for_file_var = {} + + # All counts + for file in list_inputs: + # iterate over input files + sequences = dico(file, path_inputs) + + # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS + # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only + # This causes a crash here (in the ratios function). The correction skip the whole file. + # When CDS_Search is corrected, lines with 'skip' can be removed + skip = False + for key in sequences.keys(): + if all(x == '-' for x in sequences[key]): + skip = True + + if not skip: + + nuc_counts_per_seq = {} + nuc_var_per_seq = {} + + for key in sequences.keys(): + # iterate over sequences in the file + nuc_counts_per_seq[key] = all_nuc_counts(sequences[key]) + nuc_var_per_seq[key] = ratios(nuc_counts_per_seq[key]) + + # Add NaN for missing species + for key in set(lsp).difference(set(sequences.keys())): + nuc_counts_per_seq[key] = fill_with_NaN(LN) + nuc_var_per_seq[key] = fill_with_NaN(Lratios) + + # Add computations to final dicts + dict_for_files_nuc[file] = nuc_counts_per_seq + dict_for_file_var[file] = nuc_var_per_seq + + + # Try with pandas ? + write_output(LN, flsp, path_outputs_1, dict_for_files_nuc) # one file per nuc + write_output(Lratios, flsp, path_outputs_2, dict_for_file_var) # one file per nuc_variable + +if __name__ == '__main__': + main()