changeset 0:bca6b5cb87cd draft default tip

"planemo upload for repository https://github.com/phac-nml/promer commit 09ae227a84e31c9c56f58815b1a0c8c0e0a7900e"
author nml
date Tue, 17 Dec 2019 15:55:27 -0500
parents
children
files promer4_substitions.xml promer_substitutions.py test-data/contigs.fasta test-data/query.fasta test-data/reference.fasta test-data/snps.tab
diffstat 6 files changed, 246 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/promer4_substitions.xml	Tue Dec 17 15:55:27 2019 -0500
@@ -0,0 +1,59 @@
+<tool id="promer4_substitutions" name="report substitutions" version="1.2">
+    <description>aligns two sets of contigs and reports amino acid substitutions between them</description>
+    <requirements>
+          <requirement type="package" version="3.5.6">python</requirement>
+          <requirement type="package" version="4.0.0beta2">mummer4</requirement>
+          <requirement type="package" version="1.68">biopython</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+
+        python '$__tool_directory__/promer_substitutions.py' '$reference_fasta' '$query_fasta'
+
+    ]]></command>
+
+    <inputs>
+        <param name="reference_fasta" type="data" format="fasta" optional="false" label="Reference FASTA"/>
+        <param name="query_fasta" type="data" format="fasta" optional="false" label="Query FASTA" />
+    </inputs>
+
+    <outputs>
+        <data format="tabular" label="substitutions" name="output_substitutions" from_work_dir="snps.tab" />
+        <data format="fasta" label="contigs" name="output_contigs" from_work_dir="contigs.fasta" />
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="query_fasta" value="query.fasta" />
+            <param name="reference_fasta" value="reference.fasta" />
+            <output name="output_substitutions" file="snps.tab" ftype="tabular" />
+            <output name="output_contigs" file="contigs.fasta" ftype="fasta" />
+        </test>
+    </tests>
+
+    <help><![CDATA[
+Report Substitutions
+=====================
+
+Aligns a set of query contigs against a set of reference contigs and reports the amino acid differences between the two sets of contigs using MUMmer4 (promer):
+
+Kurtz, Stefan et al. “Versatile and Open Software for Comparing Large Genomes.” Genome Biology 5.2 (2004): R12. PMC. Web. 22 Aug. 2018.
+
+The alignments are performed in amino acid space. Only the substitutions from the best set of global alignments are reported in the output.
+
+------
+Inputs
+------
+
+1. Query FASTA: A nucleotide or amino acid FASTA query file.
+2. Reference FASTA: A nucleotide or amino acid FASTA reference file.
+
+-------
+Outputs
+-------
+
+1. substitions: A tabular file detailing subsitutions between aligned contigs.
+
+]]></help>
+    <citations>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/promer_substitutions.py	Tue Dec 17 15:55:27 2019 -0500
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+
+import os
+import subprocess
+import sys
+
+from Bio import SeqIO
+from Bio.SeqIO import FastaIO
+
+"""
+# =============================================================================
+Copyright Government of Canada 2018
+Written by: Eric Marinier, Public Health Agency of Canada,
+    National Microbiology Laboratory
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License. You may obtain a copy of the
+License at:
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+# =============================================================================
+"""
+
+__version__ = '0.2.0'
+
+FASTA_DIRECTORY = "fasta"
+
+REF_START = 2
+REF_END = 3
+
+HEADER_ROW = "[P1]\t[SUB]\t[SUB]\t[P2]\t[BUFF]\t[DIST]\
+    \t[R]\t[Q]\t[FRM]\t[FRM]\t[TAG]\t[TAG]\n"
+
+
+def reorient_file(fasta_location, start, end):
+
+    record = list(SeqIO.parse(fasta_location, "fasta"))[0]
+
+    # reversed
+    if start > end:
+        record.seq = record.seq[(end - 1):start].reverse_complement()
+
+    # same orientation
+    else:
+        record.seq = record.seq[(start - 1):end]
+
+    SeqIO.write(record, fasta_location, "fasta")
+
+
+def promer(reference_location, query_location):
+
+    # promer
+    subprocess.check_output(
+        ['promer', reference_location, query_location],
+        universal_newlines=True)
+
+
+# File locations from arguments:
+reference_location = sys.argv[1]
+query_location = sys.argv[2]
+
+# Make directories:
+if not os.path.exists(FASTA_DIRECTORY):
+    os.mkdir(FASTA_DIRECTORY)
+
+# Read query FASTA:
+query = list(SeqIO.parse(query_location, "fasta"))
+fasta_locations = []
+
+# Prepare final output:
+snps_file = open("snps.tab", "w")
+snps_file.write(HEADER_ROW)
+
+# Split the query FASTA file into multiple files, each with one record:
+# (This is done to work around a bug in promer.)
+for record in query:
+
+    output_name = str(record.id) + ".fasta"
+    output_location = os.path.join(FASTA_DIRECTORY, output_name)
+
+    fasta_locations.append(output_location)
+    SeqIO.write(record, output_location, "fasta")
+
+# Run promer on each (new) FASTA file:
+for fasta_location in fasta_locations:
+
+    promer(reference_location, fasta_location)
+
+    # show-coords
+    output = subprocess.check_output(
+        ['show-coords', '-THrcl', 'out.delta'], universal_newlines=True)
+
+    if not output:
+        continue
+
+    ref_start = int(output.split()[REF_START])
+    ref_end = int(output.split()[REF_END])
+
+    reorient_file(fasta_location, ref_start, ref_end)
+    promer(reference_location, fasta_location)
+
+    # show snps
+    output = str(subprocess.check_output(
+        ['show-snps', '-T', '-q', 'out.delta'], universal_newlines=True))
+    output = output.split("\n")[4:]
+    output = "\n".join(output)
+
+    snps_file.write(output)
+
+snps_file.close()
+
+# Write all FASTA files into one output:
+output_location = "contigs.fasta"
+records = []
+
+for fasta_location in fasta_locations:
+
+    record = list(SeqIO.parse(fasta_location, "fasta"))[0]
+    records.append(record)
+
+contigs_file = open(output_location, 'w')
+fasta_writer = FastaIO.FastaWriter(contigs_file, wrap=None)
+fasta_writer.write_file(records)
+contigs_file.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/contigs.fasta	Tue Dec 17 15:55:27 2019 -0500
@@ -0,0 +1,24 @@
+>NODE_1_length_6
+GATACA
+>NODE_2_length_6
+GATACA
+>NODE_3_length_6
+GATACA
+>NODE_4_length_6
+GATACA
+>NODE_5_length_6
+GATACA
+>NODE_6_length_6
+GATACA
+>NODE_7_length_6
+GATACA
+>NODE_8_length_6
+GATACA
+>NODE_9_length_6
+GATACA
+>NODE_10_length_6
+GATACA
+>NODE_11_length_6
+GATACA
+>NODE_12_length_6
+GATACA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/query.fasta	Tue Dec 17 15:55:27 2019 -0500
@@ -0,0 +1,24 @@
+>NODE_1_length_6
+GATACA
+>NODE_2_length_6
+GATACA
+>NODE_3_length_6
+GATACA
+>NODE_4_length_6
+GATACA
+>NODE_5_length_6
+GATACA
+>NODE_6_length_6
+GATACA
+>NODE_7_length_6
+GATACA
+>NODE_8_length_6
+GATACA
+>NODE_9_length_6
+GATACA
+>NODE_10_length_6
+GATACA
+>NODE_11_length_6
+GATACA
+>NODE_12_length_6
+GATACA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reference.fasta	Tue Dec 17 15:55:27 2019 -0500
@@ -0,0 +1,12 @@
+>NODE_1_length_300
+CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA
+>NODE_2_length_300
+CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA
+>NODE_3_length_300
+CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA
+>NODE_4_length_300
+CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA
+>NODE_5_length_300
+CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA
+>NODE_6_length_300
+CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/snps.tab	Tue Dec 17 15:55:27 2019 -0500
@@ -0,0 +1,1 @@
+[P1]	[SUB]	[SUB]	[P2]	[BUFF]	[DIST]    	[R]	[Q]	[FRM]	[FRM]	[TAG]	[TAG]