changeset 0:47b586ab4729 draft default tip

planemo upload commit 4fee4519135f7677cf50f721cf1ad7a7335ad66d-dirty
author nml
date Fri, 06 Apr 2018 14:29:17 -0400 (2018-04-06)
parents
children
files pseudogenome.pl pseudogenome.xml test-data/custom.fasta test-data/input.fasta test-data/output.fasta
diffstat 5 files changed, 384 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pseudogenome.pl	Fri Apr 06 14:29:17 2018 -0400
@@ -0,0 +1,198 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use autodie qw(:all);
+use Bio::SeqIO;
+use Readonly;
+use File::Basename;
+use Getopt::Long;
+use Pod::Usage;
+Getopt::Long::Configure('bundling');
+
+=head1 NAME
+
+nml_pseudogenome.pl - Create a single pseudo genome out of multiple contigs provided in a single fasta file. Contigs are combined in order of appearance in the file.
+
+=head1 SYNOPSIS
+
+nml_pseudogenome.pl -i F<file_name.fna> -n 100 -c X -o F<filename.fna>
+
+=head1 OPTIONS
+
+=over
+
+=item B<-i>, B<--input>
+
+Input fasta file containing one or more contigs (required)
+
+=item B<-n>, B<--number>
+
+Number of filler base pairs to be added, default : 10
+
+=item B<-c>, B<--char>
+
+Character to be used as the 'glue' between contigs, default : 'N'
+
+=item B<--id>
+
+Sequence identifier written to the header of the output fasta record, default : pseudogenome
+
+=item B<-o>, B<--output>
+
+Output file name, default : base name of the input file with '.pseudogenome' appended
+
+=item B<-s>, B<--stitch>
+
+Use the fixed linker 'NNNNNCACACACTTAATTAATTAAGTGTGTGNNNNN' as the glue after each contig instead of the filler given by -n and -c
+
+=item B<-h>, B<--help>
+
+Print this help
+
+=item EXAMPLE
+
+nml_pseudogenome.pl -i multiple_fasta.fna -n 100 -c X -o pseudo.fna
+
+nml_pseudogenome.pl -i another_multiple.fna
+
+=back
+
+=head1 DESCRIPTION
+
+Creates a single pseudo genome out of multiple contigs provided in a single fasta file. Contigs are combined in order of appearance in the file.
+
+=cut
+
+# Nonsub perlcode
+
+# Defaults for the custom filler, plus the fixed linker used with --stitch.
+Readonly my $DEFAULT_NUM_CHAR => 10;
+Readonly my $stitch_pattern => 'NNNNNCACACACTTAATTAATTAAGTGTGTGNNNNN';
+Readonly my $DEFAULT_CHAR => 'N';
+my ( $input, $id, $number, $char, $output, $stitch, $help );
+
+# GetOptions returns false on unknown or malformed options; stop with the
+# usage text (exit status 2) instead of running with whatever was parsed.
+GetOptions(
+    'i|input=s'    => \$input,
+    'n|number=s' => \$number,
+    'c|char=s'   => \$char,
+    'o|output=s'   => \$output,
+    'h|help'       => \$help,
+    's|stitch' => \$stitch,
+    'id=s' => \$id
+) or pod2usage(2);
+
+# check_inputs validates the options, applies defaults and exits on bad input.
+( $input, $id, $number, $char, $output ) =
+  check_inputs( $input, $number, $char, $output, $help, $stitch );
+
+my $in = Bio::SeqIO->new( -file => $input, -format => 'fasta' );
+
+# The separator is the same for every contig, so build it once up front.
+my $separator = $stitch ? $stitch_pattern : $char x $number;
+
+# Append every contig in file order, each followed by the separator.  The
+# separator is also written after the last contig; the expected outputs in
+# test-data end with it, so that trailing copy is kept.
+my $sequence = '';
+while ( my $seq = $in->next_seq() ) {
+    $sequence .= $seq->seq . $separator;
+}
+
+my $main = Bio::Seq->new( -display_id => $id, -seq => $sequence );
+my $out = Bio::SeqIO->new( -file => ">$output", -format => 'fasta' );
+$out->write_seq($main);
+exit;
+
+=begin HTML
+
+=head2 check_inputs
+
+     Title   : check_inputs
+     Usage   : ($fasta,$id,$num,$filler,$out_to) = check_inputs($fasta,$num,$filler,$out_to,$usage,$use_stitch);
+     Function: Validate the command line arguments and fill in defaults; prints the usage text and exits on bad input
+     Returns : List ( $fasta, $id, $num, $filler, $out_to ); $id comes from the file-scoped --id option, default 'pseudogenome'
+     Args    : $fasta: Path of the input fasta file, must exist
+               $num: Number of filler characters (0 is allowed), default 10; $filler: Filler character, default 'N'
+               $out_to: Output file name, default is the input base name with '.pseudogenome' appended
+               $usage: If true, print the usage text and exit; $use_stitch: If true, $num and $filler are not checked
+     Throws  : none (invalid input ends the program through pod2usage)
+
+=cut
+
+sub check_inputs {
+    my ( $fasta, $num, $filler, $out_to, $usage, $use_stitch ) = @_;
+
+    # Explicit help request: print the option list to STDOUT and exit 0.
+    pod2usage( -exitval => 0, -verbose => 1 ) if $usage;
+
+    # No usable argument at all: brief usage on STDERR, exit status 2.
+    pod2usage(2) if !( $fasta || $num || $filler || $out_to );
+
+    # The input file is mandatory and has to exist on disk.
+    if ( !defined $fasta || !( -e $fasta ) ) {
+        pod2usage( -message => 'Error: Input file not given or does not exist', -exitval => 2 );
+    }
+
+    if ($use_stitch) {
+        # The fixed linker is used, so $num and $filler are left unchecked.
+        print "Using stitch pattern\n";
+    }
+    else {
+        # Test for "given", not for truth, so an explicit '-n 0' is honoured.
+        if ( !defined $num || !length $num ) {
+            $num = $DEFAULT_NUM_CHAR;
+            print STDERR "Number of character not given, using $num\n";
+        }
+        elsif ( $num !~ /\A [0-9]+ \z/xms ) {
+            pod2usage( -message => 'Error: Number of character was not a number', -exitval => 2 );
+        }
+
+        # Likewise '0' is a legal, if unusual, filler character.
+        if ( !defined $filler || !length $filler ) {
+            $filler = $DEFAULT_CHAR;
+            print STDERR "No filler character given, using 'N'\n";
+        }
+    }
+
+    if ( !defined $out_to || !length $out_to ) {
+        # In scalar context fileparse returns the file name without its directory.
+        $out_to = fileparse($fasta) . ".pseudogenome";
+        print
+          "Output file was not given. Result will be written to '$out_to'\n";
+    }
+
+    # $id is the file-scoped --id option; it is read here, never modified.
+    my $seq_id = ( defined $id && length $id ) ? $id : 'pseudogenome';
+
+    return ( $fasta, $seq_id, $num, $filler, $out_to );
+}
+
+=end HTML
+
+=head1 SEE ALSO
+
+No related files.
+
+=head1 AUTHOR
+
+Philip Mabon, <philip.mabon@canada.ca>
+
+=head1 BUGS
+
+None reported.
+
+=head1 COPYRIGHT & LICENSE 
+
+Copyright (C) 2018 by Public Health Agency of Canada
+
+This program is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.8.2 or,
+at your option, any later version of Perl 5 you may have available.
+
+=head1 DEVELOPER PAGE
+
+No developer documentation.
+
+=cut
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pseudogenome.xml	Fri Apr 06 14:29:17 2018 -0400
@@ -0,0 +1,73 @@
+<tool id="pseudogenome" name="Create pseudo genome" version="1.0.0">
+  <description>from a fasta file in order of appearance</description>
+  <requirements>
+    <requirement type="package" version="1.6.924">perl-bioperl</requirement>
+    <requirement type="package" version="1.04">perl-readonly</requirement>
+    <requirement type="package" version="2.49">perl-getopt-long</requirement>
+    <requirement type="package" version="1.25">perl-ipc-system-simple</requirement>
+  </requirements>
+  <command detect_errors="exit_code"><![CDATA[
+    perl '$__tool_directory__/pseudogenome.pl' -i '$input'
+    
+    #if $stitch.howto == "jcvi":
+        -s
+    #else
+        -n '$stitch.number' -c '$stitch.glue'
+    #end if
+
+    -o '$output'
+  ]]></command>
+  <inputs>
+    <param name="input" type="data" format="fasta" label="Multi contig fasta file" optional="false"/>
+
+    <conditional name="stitch">
+      <param name="howto" type="select" label="How do you want to merge contigs?">
+        <option selected="true" value="jcvi">JCVI Linker</option>
+        <option value="custom">Custom options</option>
+      </param>
+      <when value="jcvi">
+      </when>
+      <when value="custom">
+	<param name="number" type="integer" value="10" label="Number of filler base pairs" optional="false"/>
+	<param name="glue" type="text" value="N" label="Character inserted between contigs" optional="false"/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="output"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="input.fasta"/>
+      <output name="output" value="output.fasta"/>
+    </test>
+    <test>
+      <param name="input" value="input.fasta"/>
+      <param name="howto" value="custom"/>
+      <param name="number" value="50"/>
+      <param name="glue" value="X"/>
+      <output name="output" value="custom.fasta"/>
+    </test>    
+  </tests>
+  <help>
+
+What it does
+============
+This tool takes in a multi-contig fasta file and converts it into a pseudo genome.
+
+
+
+JCVI Linker
+============
+
+The linker is a 36 base pair sequence which places start and
+stop codons in all 6 reading frames to prevent genes from being predicted across contigs.
+
+Sequence below:
+
+"NNNNNCACACACTTAATTAATTAAGTGTGTGNNNNN"
+
+  </help>
+  <citations>
+  </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/custom.fasta	Fri Apr 06 14:29:17 2018 -0400
@@ -0,0 +1,40 @@
+>pseudogenome
+ATGACAAAGCTAATTATTCACTTGGTTTCAGACTCTTCTGTGCAAACTGCAAAACATGCA
+GCAAATTCTGCTCTTGCTCAATTTACTTCTATAAAACAAAAATTGTATCATTGGCCAATG
+ATTAGAAATTGTGAATTACTAAATGAAGTATTAAGTAAAATAGAATCTAAACATGGAATA
+GTATTATACACAATTGCTGATCAAGAACTCCGAAAAACTTTAACAAAATTTTGCTATGAA
+TTAAAAATTCCATGTATTTCTGTAATAGGTAAAATTATTAAAGAAATGTCTGTTTTTTCA
+GGTATTGAAATAGAAAAAGAACAAAATTATAATTATAAATTCGATAAAACTTATTTTGAT
+ACACTCAATGCTATAGATTATGCTATAAGACATGATGATGGACAAATGATTAATGAATTA
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTCAGAATCTG
+ATATAATATTAATAGGTCCTTCTAGAACTTCTAAAACACCGACTTCCGTATTTTTAGCGT
+ATAATGGTTTAAAAGCTGCAAATATTCCTTATGTTTATAATTGTCCATTTCCTGATTTTA
+TAGAAAAGGATATAGATCAATTAGTAGTAGGACTTGTTATTAATCCAAATAGGTTAATTG
+AGATAAGAGAAGCTAGATTAAATTTATTGCAAATTAATGAAAATAAAAGCTATACAGATT
+TTAATATAGTACAAAGAGAGTGCATAGAAGTCAGAAAAATTTGTAATCAAAGAAATTGGC
+CAGTGATTGATGTATCAACCAGATCAATAGAGGAAACAGCAGCTTTAATAATGCGAATAT
+ATTATAATAGAAAAAATAAATATCATAAATAAAAAGATTTTTCATTATTTACAAGTAGAA
+GTGACTAATTTATAATTTTATTTATTGCTTTTCGTTGTTATGAGTTAAAAACTTAATGTC
+GTGTTATAACGAAATTACAACACTCCTTGAATTCGATAGCAATGATATCAATACAACACA
+GAGGATAAATATGGTAAATAACGTAACAGATAGCTCTTTTAAAAATGAAGTACTAGAATC
+GGATTTACCTGTAATGGTTGATTTTTGGGCAGAGTGGTGTGGACCATGTAAAATGTTAAT
+ACCGATAATAGATGAAATCAGTAAAGAATTACAAGATAAAGTAAAAGTACTCAAAATGAA
+TATTGATGAAAATCCTAAAACTCCTTCAGAATATGGTATTXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXCGTAGCATTCCAACGATAATGTTGTTTAAA
+AATGGTGAACAAAAAGATACTAAAATAGGTTTGCAACAAAAAAATTCTCTTTTAGATTGG
+ATTAATAAATCTATTTGATATTTATTTTATGTTACTTAACCATTCAAAAGTATTTATAGA
+GATAACAGATGGTTATGTAGAAGGCATAGATGTTCATAAAAGAGCACAGGGTTTAAAGCA
+TTTCTTTTTGAAAAAAGGAGTTTCTCTTTCTCCAACTATACCTATATTAAACAATATTAA
+TTTTTCTTGTTATGAGGGAGAAAAAATAGCTTTTATTGGGAGTAATGGTTCAGGTAAAAG
+TTCACTTCTAAAACTGATTGCTGGAATATATCCATTAAAATCAGGTATAGTAAAAGTTCA
+TGGAGATATTGCTGCAATTATAGATATGGGXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXAGTCGGTTTTGAACAAGAACAGACAGGTCGTGAAAATATA
+AAAATGCTAATGCTATATAATAATATGCTAGATAAATATAGCAAAAAAATTGAAAAAGAA
+ATTATAGATTTTTCAGAACTTGGTAGTAAAATTGATTTACCGATAAAAATTTATAGTTCC
+GGTATGTTATCACGCCTTGCTTTTTCTGTATCGATATTTCAGAATCCACAGATTCTATTA
+CTTGATGAAGTTTTTGCAGCAGGTGATAGCTATTTTATAGAGAAATCCCTTAATTTAATG
+AAGAATAAATTTAAAAATACCCCTATTTCAATAATAGTAAGTCATCAAGAAGAAATTATA
+AAAGATAATTGTGATAGATGTATTTTATTAAAAGACGGTCATATTATAGATGATGGGACA
+CCATCAGAAATATTTAAAATCTATAAACAACAAAGTAATAAGGAAATTCATAAATGATAA
+AGTATTTTTTTTCTAAAAAATACTGGAGGGXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXX
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.fasta	Fri Apr 06 14:29:17 2018 -0400
@@ -0,0 +1,34 @@
+>contig1
+ATGACAAAGCTAATTATTCACTTGGTTTCAGACTCTTCTGTGCAAACTGCAAAACATGCAGCAAATTCTG
+CTCTTGCTCAATTTACTTCTATAAAACAAAAATTGTATCATTGGCCAATGATTAGAAATTGTGAATTACT
+AAATGAAGTATTAAGTAAAATAGAATCTAAACATGGAATAGTATTATACACAATTGCTGATCAAGAACTC
+CGAAAAACTTTAACAAAATTTTGCTATGAATTAAAAATTCCATGTATTTCTGTAATAGGTAAAATTATTA
+AAGAAATGTCTGTTTTTTCAGGTATTGAAATAGAAAAAGAACAAAATTATAATTATAAATTCGATAAAAC
+TTATTTTGATACACTCAATGCTATAGATTATGCTATAAGACATGATGATGGACAAATGATTAATGAATTA
+>contig2
+TCAGAATCTGATATAATATTAATAGGTCCTTCTAGAACTTCTAAAACACCGACTTCCGTATTTTTAGCGT
+ATAATGGTTTAAAAGCTGCAAATATTCCTTATGTTTATAATTGTCCATTTCCTGATTTTATAGAAAAGGA
+TATAGATCAATTAGTAGTAGGACTTGTTATTAATCCAAATAGGTTAATTGAGATAAGAGAAGCTAGATTA
+AATTTATTGCAAATTAATGAAAATAAAAGCTATACAGATTTTAATATAGTACAAAGAGAGTGCATAGAAG
+TCAGAAAAATTTGTAATCAAAGAAATTGGCCAGTGATTGATGTATCAACCAGATCAATAGAGGAAACAGC
+AGCTTTAATAATGCGAATATATTATAATAGAAAAAATAAATATCATAAATAAAAAGATTTTTCATTATTT
+ACAAGTAGAAGTGACTAATTTATAATTTTATTTATTGCTTTTCGTTGTTATGAGTTAAAAACTTAATGTC
+GTGTTATAACGAAATTACAACACTCCTTGAATTCGATAGCAATGATATCAATACAACACAGAGGATAAAT
+ATGGTAAATAACGTAACAGATAGCTCTTTTAAAAATGAAGTACTAGAATCGGATTTACCTGTAATGGTTG
+ATTTTTGGGCAGAGTGGTGTGGACCATGTAAAATGTTAATACCGATAATAGATGAAATCAGTAAAGAATT
+ACAAGATAAAGTAAAAGTACTCAAAATGAATATTGATGAAAATCCTAAAACTCCTTCAGAATATGGTATT
+>contig3
+CGTAGCATTCCAACGATAATGTTGTTTAAAAATGGTGAACAAAAAGATACTAAAATAGGTTTGCAACAAA
+AAAATTCTCTTTTAGATTGGATTAATAAATCTATTTGATATTTATTTTATGTTACTTAACCATTCAAAAG
+TATTTATAGAGATAACAGATGGTTATGTAGAAGGCATAGATGTTCATAAAAGAGCACAGGGTTTAAAGCA
+TTTCTTTTTGAAAAAAGGAGTTTCTCTTTCTCCAACTATACCTATATTAAACAATATTAATTTTTCTTGT
+TATGAGGGAGAAAAAATAGCTTTTATTGGGAGTAATGGTTCAGGTAAAAGTTCACTTCTAAAACTGATTG
+CTGGAATATATCCATTAAAATCAGGTATAGTAAAAGTTCATGGAGATATTGCTGCAATTATAGATATGGG
+>contig4
+AGTCGGTTTTGAACAAGAACAGACAGGTCGTGAAAATATAAAAATGCTAATGCTATATAATAATATGCTA
+GATAAATATAGCAAAAAAATTGAAAAAGAAATTATAGATTTTTCAGAACTTGGTAGTAAAATTGATTTAC
+CGATAAAAATTTATAGTTCCGGTATGTTATCACGCCTTGCTTTTTCTGTATCGATATTTCAGAATCCACA
+GATTCTATTACTTGATGAAGTTTTTGCAGCAGGTGATAGCTATTTTATAGAGAAATCCCTTAATTTAATG
+AAGAATAAATTTAAAAATACCCCTATTTCAATAATAGTAAGTCATCAAGAAGAAATTATAAAAGATAATT
+GTGATAGATGTATTTTATTAAAAGACGGTCATATTATAGATGATGGGACACCATCAGAAATATTTAAAAT
+CTATAAACAACAAAGTAATAAGGAAATTCATAAATGATAAAGTATTTTTTTTCTAAAAAATACTGGAGGG
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output.fasta	Fri Apr 06 14:29:17 2018 -0400
@@ -0,0 +1,39 @@
+>pseudogenome
+ATGACAAAGCTAATTATTCACTTGGTTTCAGACTCTTCTGTGCAAACTGCAAAACATGCA
+GCAAATTCTGCTCTTGCTCAATTTACTTCTATAAAACAAAAATTGTATCATTGGCCAATG
+ATTAGAAATTGTGAATTACTAAATGAAGTATTAAGTAAAATAGAATCTAAACATGGAATA
+GTATTATACACAATTGCTGATCAAGAACTCCGAAAAACTTTAACAAAATTTTGCTATGAA
+TTAAAAATTCCATGTATTTCTGTAATAGGTAAAATTATTAAAGAAATGTCTGTTTTTTCA
+GGTATTGAAATAGAAAAAGAACAAAATTATAATTATAAATTCGATAAAACTTATTTTGAT
+ACACTCAATGCTATAGATTATGCTATAAGACATGATGATGGACAAATGATTAATGAATTA
+NNNNNCACACACTTAATTAATTAAGTGTGTGNNNNNTCAGAATCTGATATAATATTAATA
+GGTCCTTCTAGAACTTCTAAAACACCGACTTCCGTATTTTTAGCGTATAATGGTTTAAAA
+GCTGCAAATATTCCTTATGTTTATAATTGTCCATTTCCTGATTTTATAGAAAAGGATATA
+GATCAATTAGTAGTAGGACTTGTTATTAATCCAAATAGGTTAATTGAGATAAGAGAAGCT
+AGATTAAATTTATTGCAAATTAATGAAAATAAAAGCTATACAGATTTTAATATAGTACAA
+AGAGAGTGCATAGAAGTCAGAAAAATTTGTAATCAAAGAAATTGGCCAGTGATTGATGTA
+TCAACCAGATCAATAGAGGAAACAGCAGCTTTAATAATGCGAATATATTATAATAGAAAA
+AATAAATATCATAAATAAAAAGATTTTTCATTATTTACAAGTAGAAGTGACTAATTTATA
+ATTTTATTTATTGCTTTTCGTTGTTATGAGTTAAAAACTTAATGTCGTGTTATAACGAAA
+TTACAACACTCCTTGAATTCGATAGCAATGATATCAATACAACACAGAGGATAAATATGG
+TAAATAACGTAACAGATAGCTCTTTTAAAAATGAAGTACTAGAATCGGATTTACCTGTAA
+TGGTTGATTTTTGGGCAGAGTGGTGTGGACCATGTAAAATGTTAATACCGATAATAGATG
+AAATCAGTAAAGAATTACAAGATAAAGTAAAAGTACTCAAAATGAATATTGATGAAAATC
+CTAAAACTCCTTCAGAATATGGTATTNNNNNCACACACTTAATTAATTAAGTGTGTGNNN
+NNCGTAGCATTCCAACGATAATGTTGTTTAAAAATGGTGAACAAAAAGATACTAAAATAG
+GTTTGCAACAAAAAAATTCTCTTTTAGATTGGATTAATAAATCTATTTGATATTTATTTT
+ATGTTACTTAACCATTCAAAAGTATTTATAGAGATAACAGATGGTTATGTAGAAGGCATA
+GATGTTCATAAAAGAGCACAGGGTTTAAAGCATTTCTTTTTGAAAAAAGGAGTTTCTCTT
+TCTCCAACTATACCTATATTAAACAATATTAATTTTTCTTGTTATGAGGGAGAAAAAATA
+GCTTTTATTGGGAGTAATGGTTCAGGTAAAAGTTCACTTCTAAAACTGATTGCTGGAATA
+TATCCATTAAAATCAGGTATAGTAAAAGTTCATGGAGATATTGCTGCAATTATAGATATG
+GGNNNNNCACACACTTAATTAATTAAGTGTGTGNNNNNAGTCGGTTTTGAACAAGAACAG
+ACAGGTCGTGAAAATATAAAAATGCTAATGCTATATAATAATATGCTAGATAAATATAGC
+AAAAAAATTGAAAAAGAAATTATAGATTTTTCAGAACTTGGTAGTAAAATTGATTTACCG
+ATAAAAATTTATAGTTCCGGTATGTTATCACGCCTTGCTTTTTCTGTATCGATATTTCAG
+AATCCACAGATTCTATTACTTGATGAAGTTTTTGCAGCAGGTGATAGCTATTTTATAGAG
+AAATCCCTTAATTTAATGAAGAATAAATTTAAAAATACCCCTATTTCAATAATAGTAAGT
+CATCAAGAAGAAATTATAAAAGATAATTGTGATAGATGTATTTTATTAAAAGACGGTCAT
+ATTATAGATGATGGGACACCATCAGAAATATTTAAAATCTATAAACAACAAAGTAATAAG
+GAAATTCATAAATGATAAAGTATTTTTTTTCTAAAAAATACTGGAGGGNNNNNCACACAC
+TTAATTAATTAAGTGTGTGNNNNN