Mercurial > repos > iuc > hyphy_fel
diff scripts/strike-ambigs.bf @ 24:389fd0e51bcb draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hyphy/ commit 2742ee3b4e90f65352845265d2f85c4263e0eabb"
author | iuc |
---|---|
date | Tue, 20 Apr 2021 10:28:22 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/strike-ambigs.bf Tue Apr 20 10:28:22 2021 +0000 @@ -0,0 +1,83 @@ +RequireVersion ("2.5.20"); + +LoadFunctionLibrary ("libv3/tasks/alignments.bf"); +LoadFunctionLibrary ("libv3/tasks/trees.bf"); +LoadFunctionLibrary ("libv3/UtilityFunctions.bf"); +LoadFunctionLibrary ("libv3/IOFunctions.bf"); +LoadFunctionLibrary ("libv3/convenience/math.bf"); + + + +filter.analysis_description = {terms.io.info : + " + Read an alignment of coding sequences and replace any ambiguous codons with ---. Write results to a new file in FASTA format, and report changed sequences to stdout + ", + terms.io.version : "0.1", + terms.io.reference : "TBD", + terms.io.authors : "Sergei L Kosakovsky Pond", + terms.io.contact : "spond@temple.edu", + terms.io.requirements : "An MSA" + }; + + +io.DisplayAnalysisBanner (filter.analysis_description); + +utility.SetEnvVariable ("NORMALIZE_SEQUENCE_NAMES", FALSE); + +KeywordArgument ("code", "Which genetic code should be used", "Universal"); +KeywordArgument ("alignment", "An in-frame codon alignment in one of the formats supported by HyPhy"); + +filter.in = alignments.PromptForGeneticCodeAndAlignment ("filter.dataset", "filter.input"); + +KeywordArgument ("output", ".fasta for compressed data", None); +filter.out = io.PromptUserForFilePath(".fasta for filtered data"); +fprintf (filter.out, CLEAR_FILE, KEEP_OPEN); + +GetDataInfo (filter.site_patterns, filter.input); + +filter.patter2site = {}; + + +for (i,j,v; in; filter.site_patterns) { + index = i+j; + if (filter.patter2site / v == FALSE ) { + filter.patter2site [v] = {}; + } + filter.patter2site [v] + index; +} + +GET_DATA_INFO_RETURNS_ONLY_THE_INDEX = TRUE; +COUNT_GAPS_IN_FREQUENCIES = FALSE; +filter.unique_patterns = utility.Array1D (filter.input.site_freqs); + +for (seq = 0; seq < filter.input.species; seq += 1) { + io.ReportProgressBar ("filter","Processing sequence " + (1+seq)); + codons = {1, filter.input.sites}; + codons [0] = ""; + GetString (seq_name, filter.input, seq); + GetDataInfo (seq_chars, filter.input, seq); + + filter.ambigs = 0; + + for (pattern = 0; pattern < filter.unique_patterns; pattern += 1) { + GetDataInfo (pattern_info, filter.input, seq, pattern); + if (pattern_info >= 0) { + codon_start = (filter.patter2site[pattern])[0] * 3; + codon = seq_chars [codon_start][codon_start+2]; + } else { + codon = "---"; + filter.ambigs += Abs (filter.patter2site [pattern]) + } + for (c; in; filter.patter2site [pattern] ) { + codons[c] = codon; + } + } + if (filter.ambigs > 0) { + fprintf (stdout, "\nStriking ", filter.ambigs, " codons that are incompletely resolved from " + seq_name + "\n"); + } + fprintf (filter.out,">",seq_name,"\n",Join ("", codons), "\n"); +} + +fprintf (filter.out,CLOSE_FILE); + +