changeset 1:cf50aeb956f2 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
author iuc
date Wed, 20 Apr 2022 16:59:05 +0000
parents ba95715078c9
children
files macros.xml test-data/filter-out1.fasta tn93_cluster.py tn93_filter.py tn93_filter.xml
diffstat 5 files changed, 77 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/macros.xml	Fri Apr 23 03:05:08 2021 +0000
+++ b/macros.xml	Wed Apr 20 16:59:05 2022 +0000
@@ -1,6 +1,12 @@
 <?xml version="1.0"?>
 <macros>
-    <token name="@VERSION@">1.0.6</token>
+    <token name="@TOOL_VERSION@">1.0.6</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">tn93</requirement>
+            <yield />
+        </requirements>
+    </xml>
     <xml name="citations">
         <citations>
             <citation type="bibtex">
--- a/test-data/filter-out1.fasta	Fri Apr 23 03:05:08 2021 +0000
+++ b/test-data/filter-out1.fasta	Wed Apr 20 16:59:05 2022 +0000
@@ -13,3 +13,8 @@
 >gb_MW518841_Organism_Severe_acute_respiratory_syndrome_coronavirus_2_Strain_Name_SARS_CoV_2_human_USA_CA_CDC_STM_220_2020_Segment_null_1
 ATGTTAGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTTGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATCATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTTAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGGTGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGCTGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACA
 
+
+>epi_isl_1041403/hCoV-19/USA/NY-PRL-2021_02_08_05H08/2021
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+>REFERENCE
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GAGACG
\ No newline at end of file
--- a/tn93_cluster.py	Fri Apr 23 03:05:08 2021 +0000
+++ b/tn93_cluster.py	Wed Apr 20 16:59:05 2022 +0000
@@ -2,7 +2,6 @@
 import json
 import os
 import shlex
-import shutil
 import subprocess
 import sys
 
@@ -41,27 +40,22 @@
 def main(arguments):
     threshold = arguments.threshold
     step = threshold * 0.25
-    shutil.copy(arguments.input, os.path.join(os.getcwd(), 'reference_msa.fa'))
-    shutil.copy(arguments.input, os.path.join(os.getcwd(), 'reference_msa.fa.bak'))
     with open(arguments.reference) as fh:
         for line in fh:
             if line[0] == '>':
                 _ref_seq_name = line[1:].split(' ')[0].strip()
                 break
-    while True and threshold <= 1:
-        command = 'tn93-cluster -o clusters.json -t %g -a %s -c %s -m json -l %d -g %f reference_msa.fa' % (threshold, arguments.ambigs, arguments.cluster_type, arguments.overlap, arguments.fraction)
+    while threshold <= 1:
+        command = 'tn93-cluster -o clusters.json -t %g -a %s -c %s -m json -l %d -g %f %s' % (threshold, arguments.ambigs, arguments.cluster_type, arguments.overlap, arguments.fraction, arguments.input)
         return_code = run_command(command)
         if return_code != 0:
             return return_code
-        input_stamp, cluster_count = cluster_to_fasta('clusters.json', 'reference_msa.fa.bak', _ref_seq_name)
-        if cluster_count <= arguments.cluster_count or threshold == 1:
+        input_stamp, cluster_count = cluster_to_fasta('clusters.json', 'clusters.fa', _ref_seq_name)
+        if cluster_count <= arguments.cluster_count:
             break
         else:
             threshold += step
         print('Found %d clusters at threshold %f' % (cluster_count, threshold))
-    shutil.copy('reference_msa.fa.bak', arguments.compressed)
-    shutil.copy('clusters.json', arguments.output)
-    os.remove('reference_msa.fa.bak')
     return 0
 
 
--- a/tn93_filter.py	Fri Apr 23 03:05:08 2021 +0000
+++ b/tn93_filter.py	Wed Apr 20 16:59:05 2022 +0000
@@ -1,5 +1,6 @@
 import argparse
 import csv
+import random
 
 from Bio import SeqIO
 
@@ -8,15 +9,22 @@
 arguments.add_argument('-f', '--reference', help='Reference sequence', required=True, type=str)
 arguments.add_argument('-d', '--distances', help='Calculated pairwise distances', required=True, type=str)
 arguments.add_argument('-r', '--reads', help='Output file for filtered reads', required=True, type=str)
-arguments.add_argument('-q', '--clusters', help='Compressed clusters', required=True, type=str)
+arguments.add_argument('-q', '--clusters', help='Compressed background clusters', required=True, type=str)
 settings = arguments.parse_args()
 
 reference_name = 'REFERENCE'
 reference_seq = ''
 
+
+def unique_id(new_id, existing_ids):
+    while new_id in existing_ids:
+        new_id += '_' + ''.join(random.choices('0123456789abcdef', k=10))
+    return new_id
+
+
 with open(settings.reference) as seq_fh:
     for seq_record in SeqIO.parse(seq_fh, 'fasta'):
-        reference_name = seq_record.name
+        reference_name = seq_record.name.split(' ')[0]
         reference_seq = seq_record.seq
         break
 
@@ -27,17 +35,19 @@
     for line in reader:
         if line[1] not in seqs_to_filter:
             seqs_to_filter.add(line[1])
+        else:
+            seqs_to_filter.add(unique_id(line[1], seqs_to_filter))
     if reference_name in seqs_to_filter:
         seqs_to_filter.remove(reference_name)
 
 with open(settings.reads, "a+") as fh:
     seqs_filtered = list()
     for seq_record in SeqIO.parse(settings.clusters, "fasta"):
+        if seq_record.name.split(' ')[0] == reference_name:
+            continue
         if seq_record.name not in seqs_to_filter:
-            if seq_record.name == reference_name:
-                if seq_record.name not in seqs_filtered:
-                    seqs_filtered.append(seq_record.name)
-                else:
-                    continue
+            unique_name = unique_id(seq_record.name, seqs_filtered)
+            fh.write('\n>%s\n%s' % (unique_name, seq_record.seq))
+            seqs_filtered.append(unique_name)
     if reference_name not in seqs_filtered:
         fh.write('\n>REFERENCE\n%s' % reference_seq)
--- a/tn93_filter.xml	Fri Apr 23 03:05:08 2021 +0000
+++ b/tn93_filter.xml	Wed Apr 20 16:59:05 2022 +0000
@@ -1,12 +1,11 @@
-<tool id="tn93_filter" name="TN93 Filter" version="@VERSION@">
+<tool id="tn93_filter" name="TN93 Filter" version="@TOOL_VERSION@+galaxy1">
     <description>- remove sequences from a reference that are within a given distance of of a cluster</description>
     <macros>
         <import>macros.xml</import>
     </macros>
-    <requirements>
-        <requirement type="package" version="@VERSION@">tn93</requirement>
+    <expand macro="requirements">
         <requirement type="package" version="1.70">biopython</requirement>
-    </requirements>
+    </expand>
     <version_command><![CDATA[tn93 --version]]></version_command>
     <command detect_errors="exit_code"><![CDATA[
     cat '$reads' > filtered_msa.fa &&
@@ -20,7 +19,7 @@
         <param name="reads" type="data" format="fasta" label="Aligned reads" />
         <param name="reference" type="data" format="fasta" label="Reference sequence" />
         <param name="clusters" type="data" format="fasta" label="FASTA file with compressed clusters" />
-        <param name="threshold" type="float" value="0.015" label="Pairwise distance threshold" />
+        <param name="threshold" type="float" value="0.015" min="0" max="1" label="Pairwise distance threshold" />
     </inputs>
     <outputs>
         <data name="filtered_reference" format="fasta" from_work_dir="filtered_msa.fa" />
@@ -35,11 +34,47 @@
         </test>
     </tests>
     <help><![CDATA[
-TN93-Filter
-===========
+TN93
+----
+
+This is a simple program meant to compute pairwise distances between aligned 
+nucleotide sequences in sequential FASTA format using the 
+[Tamura Nei 93 distance](http://www.ncbi.nlm.nih.gov/pubmed/8336541).
+
+USAGE
+-----
+
+    usage: tn93 [-h] [-o OUTPUT] [-t THRESHOLD] [-a AMBIGS] [-l OVERLAP][-d COUNTS_IN_NAME] [-f FORMAT] [-s SECOND_FASTA] [-b] [-c] [-q] [FASTA]
+
+Try it from using the example file in 'data' by typing 
+
+    tn93 -t 0.05 -o data/test.dst data/test.fas
+
+Output (diagnostics written to stderr, histogram written to stdout so can be redirected)
+
+Example:
 
-Removes aligned sequences that are within a given distance from a reference
-sequence using the 1993 Tamura-Nei distance calculation.
+    Read 8 sequences of length 1320
+    Will perform 28 pairwise distance calculations
+    Progress:     100% (       7 links found,          inf evals/sec)
+    {
+        "Actual comparisons performed" :28,
+        "Total comparisons possible" : 28,
+        "Links found" : 7,
+        "Maximum distance" : 0.0955213,
+        "Mean distance" : 0.0644451,
+        "Histogram" : [[0.005,0],[0.01,0],[0.015,0],[0.02,0],[0.025,0],[0.03,2],[0.035,1],[0.04,0],[0.045,1],[0.05,3],[0.055,1],[0.06,2],[0.065,2],[0.07,3],[0.075,4],[0.08,3],[0.085,3],[0.09,2],[0.095,0],[0.1,1],[0.105,0],[0.11,0],[0.115,0],[0.12,0],[0.125,0],[0.13,0],[0.135,0],[0.14,0],[0.145,0],[0.15,0],[0.155,0],[0.16,0],[0.165,0],[0.17,0],[0.175,0],[0.18,0],[0.185,0],[0.19,0],[0.195,0],[0.2,0],[0.205,0],[0.21,0],[0.215,0],[0.22,0],[0.225,0],[0.23,0],[0.235,0],[0.24,0],[0.245,0],[0.25,0],[0.255,0],[0.26,0],[0.265,0],[0.27,0],[0.275,0],[0.28,0],[0.285,0],[0.29,0],[0.295,0],[0.3,0],[0.305,0],[0.31,0],[0.315,0],[0.32,0],[0.325,0],[0.33,0],[0.335,0],[0.34,0],[0.345,0],[0.35,0],[0.355,0],[0.36,0],[0.365,0],[0.37,0],[0.375,0],[0.38,0],[0.385,0],[0.39,0],[0.395,0],[0.4,0],[0.405,0],[0.41,0],[0.415,0],[0.42,0],[0.425,0],[0.43,0],[0.435,0],[0.44,0],[0.445,0],[0.45,0],[0.455,0],[0.46,0],[0.465,0],[0.47,0],[0.475,0],[0.48,0],[0.485,0],[0.49,0],[0.495,0],[0.5,0],[0.505,0],[0.51,0],[0.515,0],[0.52,0],[0.525,0],[0.53,0],[0.535,0],[0.54,0],[0.545,0],[0.55,0],[0.555,0],[0.56,0],[0.565,0],[0.57,0],[0.575,0],[0.58,0],[0.585,0],[0.59,0],[0.595,0],[0.6,0],[0.605,0],[0.61,0],[0.615,0],[0.62,0],[0.625,0],[0.63,0],[0.635,0],[0.64,0],[0.645,0],[0.65,0],[0.655,0],[0.66,0],[0.665,0],[0.67,0],[0.675,0],[0.68,0],[0.685,0],[0.69,0],[0.695,0],[0.7,0],[0.705,0],[0.71,0],[0.715,0],[0.72,0],[0.725,0],[0.73,0],[0.735,0],[0.74,0],[0.745,0],[0.75,0],[0.755,0],[0.76,0],[0.765,0],[0.77,0],[0.775,0],[0.78,0],[0.785,0],[0.79,0],[0.795,0],[0.8,0],[0.805,0],[0.81,0],[0.815,0],[0.82,0],[0.825,0],[0.83,0],[0.835,0],[0.84,0],[0.845,0],[0.85,0],[0.855,0],[0.86,0],[0.865,0],[0.87,0],[0.875,0],[0.88,0],[0.885,0],[0.89,0],[0.895,0],[0.9,0],[0.905,0],[0.91,0],[0.915,0],[0.92,0],[0.925,0],[0.93,0],[0.935,0],[0.94,0],[0.945,0],[0.95,0],[0.955,0],[0.96,0],[0.965,0],[0.97,0],[0.975,0],[0.98,0],[0.985,0],[0.99,0],[0.995,0],[1,0]]
+    }
+    
+NOTES
+-----
+
+All sequences must be aligned and have the same length.  Only IUPAC characters are recognized (e.g. no ~). Sequence names can include copy number as in 
+
+    >seqname:10
+
+':' can be replaced with another character using `-d`, and sequences that have no explicit copy number are assumed to be a single copy. Copy numbers 
+only affect histogram and mean calculations.
 ]]></help>
     <expand macro="citations">
         <citation type="doi">10.1093/oxfordjournals.molbev.a040023</citation>