Mercurial > repos > abims-sbr > mutcount
view scripts/S01b_extract_variable_nuc.py @ 0:acc3674e515b draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:28:50 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python #coding: utf-8 import argparse, os from functions import dico, write_output, fill_with_NaN """ Functions for nucleic format """ def all_nuc_counts(seq): """ Counts the number of nucleotides in a sequence Args : - seq (String) : a nucleic sequence Return: - nuc_counts (dict) : counts on nucleotides (key/value : nucleotide/count) """ nuc_counts = {} seqU = seq.upper() LN =['A','C','T','G'] for base in LN: nuc_counts[base] = seqU.count(base) return nuc_counts def ratios(nuc_counts): """ Compute GC and purine ratios from the counts of nucleotides in a sequence Args : - nuc_counts (dict) : dictionary of nucleotides counts (output of all_nuc_counts()) Return : - ratios (dict) : dictionary with ratios """ # Search for compositional bias in genome as marker of thermal adaptation ratios = {} ratios['GC_percent'] = float(nuc_counts['C'] + nuc_counts['G'])/(sum(nuc_counts.values())*100) ratios['purine_percent'] = float(nuc_counts['A'] + nuc_counts['G'])/(sum(nuc_counts.values())*100) ratios['DIFF_GC'] = nuc_counts['G'] - nuc_counts['C'] ratios['DIFF_AT'] = nuc_counts['A'] - nuc_counts['T'] # Per bp ratios['PLI_GC'] = float(ratios['DIFF_GC'])/sum(nuc_counts.values()) ratios['PLI_AT'] = float(ratios['DIFF_AT'])/sum(nuc_counts.values()) # Per 1000 bp ratios['PLI_GC_1000'] = ratios['PLI_GC']*1000 ratios['PLI_AT_1000'] = ratios['PLI_AT']*1000 return ratios def main(): parser = argparse.ArgumentParser() parser.add_argument("species_list", help="List of species separated by commas") args = parser.parse_args() LN =['A','C','T','G'] Lratios = ['GC_percent', 'purine_percent', 'DIFF_GC', 'DIFF_AT', 'PLI_GC', 'PLI_AT', 'PLI_GC_1000', 'PLI_AT_1000'] list_inputs = [] path_inputs = '01_input_files' list_inputs = os.listdir(path_inputs) lsp = args.species_list.split(',') lsp = sorted(lsp) flsp = '' for el in lsp: flsp += el+',' path_outputs_1 = '02_tables_per_nucleotide' path_outputs_2 = '02_tables_per_nuc_variable' os.mkdir(path_outputs_1) os.mkdir(path_outputs_2) dict_for_files_nuc = {} dict_for_file_var = {} # All counts for file in list_inputs: # iterate over input files sequences = dico(file, path_inputs) # TEMPORARY CORRECTION FOR SEQUENCES CONTAINING ONLY INDELS # It appears than CDS_Search can bug sometimes and return an alignement where a species' sequence is made of indels only # This causes a crash here (in the ratios function). The correction skip the whole file. # When CDS_Search is corrected, lines with 'skip' can be removed skip = False for key in sequences.keys(): if all(x == '-' for x in sequences[key]): skip = True if not skip: nuc_counts_per_seq = {} nuc_var_per_seq = {} for key in sequences.keys(): # iterate over sequences in the file nuc_counts_per_seq[key] = all_nuc_counts(sequences[key]) nuc_var_per_seq[key] = ratios(nuc_counts_per_seq[key]) # Add NaN for missing species for key in set(lsp).difference(set(sequences.keys())): nuc_counts_per_seq[key] = fill_with_NaN(LN) nuc_var_per_seq[key] = fill_with_NaN(Lratios) # Add computations to final dicts dict_for_files_nuc[file] = nuc_counts_per_seq dict_for_file_var[file] = nuc_var_per_seq # Try with pandas ? write_output(LN, flsp, path_outputs_1, dict_for_files_nuc) # one file per nuc write_output(Lratios, flsp, path_outputs_2, dict_for_file_var) # one file per nuc_variable if __name__ == '__main__': main()