Mercurial > repos > nml > promer
changeset 0:bca6b5cb87cd draft default tip
"planemo upload for repository https://github.com/phac-nml/promer commit 09ae227a84e31c9c56f58815b1a0c8c0e0a7900e"
author | nml |
---|---|
date | Tue, 17 Dec 2019 15:55:27 -0500 |
parents | |
children | |
files | promer4_substitions.xml promer_substitutions.py test-data/contigs.fasta test-data/query.fasta test-data/reference.fasta test-data/snps.tab |
diffstat | 6 files changed, 246 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/promer4_substitions.xml Tue Dec 17 15:55:27 2019 -0500 @@ -0,0 +1,59 @@ +<tool id="promer4_substitutions" name="report substitutions" version="1.2"> + <description>aligns two sets of contigs and reports amino acid substitutions between them</description> + <requirements> + <requirement type="package" version="3.5.6">python</requirement> + <requirement type="package" version="4.0.0beta2">mummer4</requirement> + <requirement type="package" version="1.68">biopython</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + + python '$__tool_directory__/promer_substitutions.py' '$reference_fasta' '$query_fasta' + + ]]></command> + + <inputs> + <param name="reference_fasta" type="data" format="fasta" optional="false" label="Reference FASTA"/> + <param name="query_fasta" type="data" format="fasta" optional="false" label="Query FASTA" /> + </inputs> + + <outputs> + <data format="tabular" label="substitutions" name="output_substitutions" from_work_dir="snps.tab" /> + <data format="fasta" label="contigs" name="output_contigs" from_work_dir="contigs.fasta" /> + </outputs> + + <tests> + <test> + <param name="query_fasta" value="query.fasta" /> + <param name="reference_fasta" value="reference.fasta" /> + <output name="output_substitutions" file="snps.tab" ftype="tabular" /> + <output name="output_contigs" file="contigs.fasta" ftype="fasta" /> + </test> + </tests> + + <help><![CDATA[ +Report Substitutions +===================== + +Aligns a set of query contigs against a set of reference contigs and reports the amino acid differences between the two sets of contigs using MUMmer4 (promer): + +Kurtz, Stefan et al. “Versatile and Open Software for Comparing Large Genomes.” Genome Biology 5.2 (2004): R12. PMC. Web. 22 Aug. 2018. + +The alignments are performed in amino acid space. Only the substitutions from the best set of global alignments are reported in the output. + +------ +Inputs +------ + +1. Query FASTA: A nucleotide or amino acid FASTA query file. +2. Reference FASTA: A nucleotide or amino acid FASTA reference file. + +------- +Outputs +------- + +1. substitions: A tabular file detailing subsitutions between aligned contigs. + +]]></help> + <citations> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/promer_substitutions.py Tue Dec 17 15:55:27 2019 -0500 @@ -0,0 +1,126 @@ +#!/usr/bin/env python + +import os +import subprocess +import sys + +from Bio import SeqIO +from Bio.SeqIO import FastaIO + +""" +# ============================================================================= +Copyright Government of Canada 2018 +Written by: Eric Marinier, Public Health Agency of Canada, + National Microbiology Laboratory +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at: +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. +# ============================================================================= +""" + +__version__ = '0.2.0' + +FASTA_DIRECTORY = "fasta" + +REF_START = 2 +REF_END = 3 + +HEADER_ROW = "[P1]\t[SUB]\t[SUB]\t[P2]\t[BUFF]\t[DIST]\ + \t[R]\t[Q]\t[FRM]\t[FRM]\t[TAG]\t[TAG]\n" + + +def reorient_file(fasta_location, start, end): + + record = list(SeqIO.parse(fasta_location, "fasta"))[0] + + # reversed + if start > end: + record.seq = record.seq[(end - 1):start].reverse_complement() + + # same orientation + else: + record.seq = record.seq[(start - 1):end] + + SeqIO.write(record, fasta_location, "fasta") + + +def promer(reference_location, query_location): + + # promer + subprocess.check_output( + ['promer', reference_location, query_location], + universal_newlines=True) + + +# File locations from arguments: +reference_location = sys.argv[1] +query_location = sys.argv[2] + +# Make directories: +if not os.path.exists(FASTA_DIRECTORY): + os.mkdir(FASTA_DIRECTORY) + +# Read query FASTA: +query = list(SeqIO.parse(query_location, "fasta")) +fasta_locations = [] + +# Prepare final output: +snps_file = open("snps.tab", "w") +snps_file.write(HEADER_ROW) + +# Split the query FASTA file into multiple files, each with one record: +# (This is done to work around a bug in promer.) +for record in query: + + output_name = str(record.id) + ".fasta" + output_location = os.path.join(FASTA_DIRECTORY, output_name) + + fasta_locations.append(output_location) + SeqIO.write(record, output_location, "fasta") + +# Run promer on each (new) FASTA file: +for fasta_location in fasta_locations: + + promer(reference_location, fasta_location) + + # show-coords + output = subprocess.check_output( + ['show-coords', '-THrcl', 'out.delta'], universal_newlines=True) + + if not output: + continue + + ref_start = int(output.split()[REF_START]) + ref_end = int(output.split()[REF_END]) + + reorient_file(fasta_location, ref_start, ref_end) + promer(reference_location, fasta_location) + + # show snps + output = str(subprocess.check_output( + ['show-snps', '-T', '-q', 'out.delta'], universal_newlines=True)) + output = output.split("\n")[4:] + output = "\n".join(output) + + snps_file.write(output) + +snps_file.close() + +# Write all FASTA files into one output: +output_location = "contigs.fasta" +records = [] + +for fasta_location in fasta_locations: + + record = list(SeqIO.parse(fasta_location, "fasta"))[0] + records.append(record) + +contigs_file = open(output_location, 'w') +fasta_writer = FastaIO.FastaWriter(contigs_file, wrap=None) +fasta_writer.write_file(records) +contigs_file.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/contigs.fasta Tue Dec 17 15:55:27 2019 -0500 @@ -0,0 +1,24 @@ +>NODE_1_length_6 +GATACA +>NODE_2_length_6 +GATACA +>NODE_3_length_6 +GATACA +>NODE_4_length_6 +GATACA +>NODE_5_length_6 +GATACA +>NODE_6_length_6 +GATACA +>NODE_7_length_6 +GATACA +>NODE_8_length_6 +GATACA +>NODE_9_length_6 +GATACA +>NODE_10_length_6 +GATACA +>NODE_11_length_6 +GATACA +>NODE_12_length_6 +GATACA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/query.fasta Tue Dec 17 15:55:27 2019 -0500 @@ -0,0 +1,24 @@ +>NODE_1_length_6 +GATACA +>NODE_2_length_6 +GATACA +>NODE_3_length_6 +GATACA +>NODE_4_length_6 +GATACA +>NODE_5_length_6 +GATACA +>NODE_6_length_6 +GATACA +>NODE_7_length_6 +GATACA +>NODE_8_length_6 +GATACA +>NODE_9_length_6 +GATACA +>NODE_10_length_6 +GATACA +>NODE_11_length_6 +GATACA +>NODE_12_length_6 +GATACA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/reference.fasta Tue Dec 17 15:55:27 2019 -0500 @@ -0,0 +1,12 @@ +>NODE_1_length_300 +CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA +>NODE_2_length_300 +CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA +>NODE_3_length_300 +CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA +>NODE_4_length_300 +CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA +>NODE_5_length_300 +CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA +>NODE_6_length_300 +CCAAACTCTGTAAAATCCCTATGATTAGGGACACAGAGTGAGAACCAAACTCTCCCTACGGGCAACATCAGCCTAGGAAGCCCAATCGTCTTTAGCGGTTGGGCACTTCACCTCTTTTCCATCTATAGCTCTGACACGCTCCAAATCCACAAAATGCAAAAATTCTTGATTGATTAGATTGTAGTAAAAATGCGCATGAATTTTTTTATACTCTTCAAGCGTTTTAGTCTCAATCTTTTCTAAAATAGGCTCTAACAGATCCAACTCACCCCCAAACCCTAAAAAACCTTTTAATTCTTCA