Mercurial > repos > nml > pseudogenome
changeset 0:47b586ab4729 draft default tip
planemo upload commit 4fee4519135f7677cf50f721cf1ad7a7335ad66d-dirty
author | nml |
---|---|
date | Fri, 06 Apr 2018 14:29:17 -0400 (2018-04-06) |
parents | |
children | |
files | pseudogenome.pl pseudogenome.xml test-data/custom.fasta test-data/input.fasta test-data/output.fasta |
diffstat | 5 files changed, 384 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pseudogenome.pl Fri Apr 06 14:29:17 2018 -0400 @@ -0,0 +1,198 @@
#!/usr/bin/env perl
use strict;
use warnings;
use autodie qw(:all);
use Bio::SeqIO;
use Readonly;
use File::Basename;
use Getopt::Long;
use Pod::Usage;
Getopt::Long::Configure('bundling');

=head1 NAME

nml_pseudogenome.pl - Create a single pseudo genome out of multiple contigs provided in a single fasta file. Contigs are combined in order of appearance in the file.

=head1 SYNOPSIS

nml_pseudogenome.pl -i F<file_name.fna> -n 100 -c X -o F<filename.fna>

=head1 OPTIONS

=over

=item B<-i>, B<--input>

Multiple fasta file

=item B<-n>, B<--number>

Number of filler base pairs to be added, default : 10

=item B<-c>, B<--char>, B<--chars>

Character to be used as the 'glue' between contigs, default : 'N'

=item B<--id>

Name of the sequence written to the output fasta file, default : pseudogenome

=item B<-o>, B<--output>

Output file name, default : base name of the input file with '.pseudogenome' appended

=item B<-s>, B<--stitch>

Use the fixed stitch pattern as the glue instead of the filler characters
(B<-n> and B<-c> are ignored)

=item B<-h>, B<--help>

Print this help

=item EXAMPLE

nml_pseudogenome.pl -i multiple_fasta.fna -n 100 -c X -o pseudo.fna

nml_pseudogenome.pl -i another_multiple.fna

=back

=head1 DESCRIPTION

Create a single pseudo genome out of multiple contigs provided in a single fasta file. Contigs are combined in order of appearance in the file.
The glue (filler characters or stitch pattern) is appended after every contig, including the last one.

=cut

# Nonsub perlcode

Readonly my $DEFAULT_NUM_CHAR => 10;
Readonly my $STITCH_PATTERN   => 'NNNNNCACACACTTAATTAATTAAGTGTGTGNNNNN';
Readonly my $DEFAULT_CHAR     => 'N';

my ( $input, $id, $number, $char, $output, $stitch, $help );

# Unknown or malformed options are a usage error rather than being ignored.
# 'chars' is accepted as an alias because the documentation advertised it.
GetOptions(
    'i|input=s'      => \$input,
    'n|number=s'     => \$number,
    'c|char|chars=s' => \$char,
    'o|output=s'     => \$output,
    'h|help'         => \$help,
    's|stitch'       => \$stitch,
    'id=s'           => \$id,
) or pod2usage(2);

( $input, $id, $number, $char, $output ) =
  check_inputs( $input, $number, $char, $output, $help, $stitch, $id );

my $in = Bio::SeqIO->new( -file => $input, -format => 'fasta' );

# The glue is the same for every contig, so build it once up front.
my $glue = $stitch ? $STITCH_PATTERN : ( $char x $number );

# Start from an empty string so an input without records yields an empty
# sequence instead of an undefined one.
my $sequence = q{};

# Go through every sequence and append it, followed by the glue, to the
# main sequence. The glue also follows the last contig.
while ( my $seq = $in->next_seq() ) {
    $sequence .= $seq->seq . $glue;
}

my $main = Bio::Seq->new( -display_id => $id, -seq => $sequence );

my $out = Bio::SeqIO->new( -file => ">$output", -format => 'fasta' );
$out->write_seq($main);

exit;

=begin HTML

=head2 check_inputs

 Title   : check_inputs
 Usage   : ($fasta,$seq_id,$num,$filler,$out_to) =
               check_inputs($fasta,$num,$filler,$out_to,$usage,$use_stitch,$seq_id);
 Function: Check the arguments provided by the user and fill in defaults
           for the ones that were not given
 Returns : List of ( $fasta, $seq_id, $num, $filler, $out_to ).
           Does not return when the arguments are unusable; the usage
           text is printed and the program exits instead.
 Args    : $fasta:      Path of the input multi fasta file; must exist
           $num:        Number of filler characters; digits only.
                        Defaults to 10 when not given
           $filler:     Filler character(s). Defaults to 'N' when not given
           $out_to:     Output file name. Defaults to the base name of
                        $fasta with '.pseudogenome' appended
           $usage:      If true, print the usage description and exit
           $use_stitch: If true, $num and $filler are neither checked
                        nor defaulted
           $seq_id:     Optional name of the output sequence. Defaults
                        to 'pseudogenome'
 Throws  : none

=cut

sub check_inputs {
    my ( $fasta, $num, $filler, $out_to, $usage, $use_stitch, $seq_id ) = @_;

    # "Given" means defined and non-empty, so that a literal 0 (e.g. -n 0
    # or -c 0) counts as a real value instead of triggering the default.
    my $is_given = sub {
        my ($value) = @_;
        return defined $value && length $value;
    };

    # An explicit request for help is not an error: print to STDOUT, exit 0.
    if ($usage) {
        pod2usage( -exitval => 0, -verbose => 1 );
    }

    # Nothing usable on the command line at all: show the usage text.
    # pod2usage() terminates the program, so no explicit exit is needed.
    if ( !grep { $is_given->($_) } ( $fasta, $num, $filler, $out_to ) ) {
        pod2usage(2);
    }

    if ( !$is_given->($fasta) || !( -e $fasta ) ) {
        print STDERR "Error: Input file not given or does not exist\n";
        pod2usage(2);
    }

    if ($use_stitch) {
        print "Using stitch pattern\n";
    }
    else {
        if ( !$is_given->($num) ) {
            $num = $DEFAULT_NUM_CHAR;
            print STDERR "Number of character not given, using $num\n";
        }
        # \A and \z anchor the whole string; a trailing newline is rejected.
        elsif ( $num !~ /\A\d+\z/xms ) {
            print STDERR "Error: Number of character was not a number\n";
            pod2usage(2);
        }

        if ( !$is_given->($filler) ) {
            $filler = $DEFAULT_CHAR;
            print STDERR "No filler character given, using 'N'\n";
        }
    }

    if ( !$is_given->($out_to) ) {

        # List assignment takes the first element fileparse returns,
        # which is the file name without its directory part.
        my ($base_name) = fileparse($fasta);
        $out_to = $base_name . ".pseudogenome";
        print
          "Output file was not given. Result will be written to '$out_to'\n";
    }

    if ( !$is_given->($seq_id) ) {
        $seq_id = 'pseudogenome';
    }

    return ( $fasta, $seq_id, $num, $filler, $out_to );
}

=end HTML

=head1 SEE ALSO

No related files.

=head1 AUTHOR

Philip Mabon, <philip.mabon@canada.ca>

=head1 BUGS

None reported.

=head1 COPYRIGHT & LICENSE

Copyright (C) 2018 by Public Health Agency of Canada

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.2 or,
at your option, any later version of Perl 5 you may have available.

=head1 DEVELOPER PAGE

No developer documentation.

=cut
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pseudogenome.xml Fri Apr 06 14:29:17 2018 -0400 @@ -0,0 +1,73 @@
<tool id="pseudogenome" name="Create pseudo genome" version="1.0.0">
    <description>from a fasta file in order of appearance</description>
    <requirements>
        <requirement type="package" version="1.6.924">perl-bioperl</requirement>
        <requirement type="package" version="1.04">perl-readonly</requirement>
        <requirement type="package" version="2.49">perl-getopt-long</requirement>
        <!-- needed by "use autodie qw(:all)" in pseudogenome.pl -->
        <requirement type="package" version="1.25">perl-ipc-system-simple</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        perl '$__tool_directory__/pseudogenome.pl' -i '$input'

        #if $stitch.howto == "jcvi":
          -s
        #else
          -n '$stitch.number' -c '$stitch.glue'
        #end if

        -o '$output'
    ]]></command>
    <inputs>
        <param name="input" type="data" format="fasta" label="Multi contig fasta file" optional="false"/>

        <!-- Either the fixed JCVI linker or a user-defined run of filler characters -->
        <conditional name="stitch">
            <param name="howto" type="select" label="How do you want to merge contigs?">
                <option selected="true" value="jcvi">JCVI Linker</option>
                <option value="custom">Custom options</option>
            </param>
            <when value="jcvi">
            </when>
            <when value="custom">
                <!-- pseudogenome.pl only accepts digits for -n, so negative counts are rejected here -->
                <param name="number" type="integer" value="10" min="0" label="Number of filler base pairs" optional="false"/>
                <param name="glue" type="text" value="N" label="Character inserted between contigs" optional="false"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data format="fasta" name="output"/>
    </outputs>
    <tests>
        <!-- Default settings: contigs joined with the JCVI linker -->
        <test>
            <param name="input" value="input.fasta"/>
            <output name="output" value="output.fasta"/>
        </test>
        <!-- Custom glue: 50 'X' characters after every contig -->
        <test>
            <param name="input" value="input.fasta"/>
            <conditional name="stitch">
                <param name="howto" value="custom"/>
                <param name="number" value="50"/>
                <param name="glue" value="X"/>
            </conditional>
            <output name="output" value="custom.fasta"/>
        </test>
    </tests>
    <help>

What it does
============
This tool takes in a multi-contig fasta file and converts it into a
pseudo genome.



JCVI Linker
============

Linker is a 36 base pair sequence which places start and
stop codons in all 6 reading frames to prevent genes from being predicted across contigs.

Sequence below:

"NNNNNCACACACTTAATTAATTAAGTGTGTGNNNNN"

    </help>
    <citations>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/custom.fasta Fri Apr 06 14:29:17 2018 -0400 @@ -0,0 +1,40 @@ +>pseudogenome +ATGACAAAGCTAATTATTCACTTGGTTTCAGACTCTTCTGTGCAAACTGCAAAACATGCA +GCAAATTCTGCTCTTGCTCAATTTACTTCTATAAAACAAAAATTGTATCATTGGCCAATG +ATTAGAAATTGTGAATTACTAAATGAAGTATTAAGTAAAATAGAATCTAAACATGGAATA +GTATTATACACAATTGCTGATCAAGAACTCCGAAAAACTTTAACAAAATTTTGCTATGAA +TTAAAAATTCCATGTATTTCTGTAATAGGTAAAATTATTAAAGAAATGTCTGTTTTTTCA +GGTATTGAAATAGAAAAAGAACAAAATTATAATTATAAATTCGATAAAACTTATTTTGAT +ACACTCAATGCTATAGATTATGCTATAAGACATGATGATGGACAAATGATTAATGAATTA +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTCAGAATCTG +ATATAATATTAATAGGTCCTTCTAGAACTTCTAAAACACCGACTTCCGTATTTTTAGCGT +ATAATGGTTTAAAAGCTGCAAATATTCCTTATGTTTATAATTGTCCATTTCCTGATTTTA +TAGAAAAGGATATAGATCAATTAGTAGTAGGACTTGTTATTAATCCAAATAGGTTAATTG +AGATAAGAGAAGCTAGATTAAATTTATTGCAAATTAATGAAAATAAAAGCTATACAGATT +TTAATATAGTACAAAGAGAGTGCATAGAAGTCAGAAAAATTTGTAATCAAAGAAATTGGC +CAGTGATTGATGTATCAACCAGATCAATAGAGGAAACAGCAGCTTTAATAATGCGAATAT +ATTATAATAGAAAAAATAAATATCATAAATAAAAAGATTTTTCATTATTTACAAGTAGAA +GTGACTAATTTATAATTTTATTTATTGCTTTTCGTTGTTATGAGTTAAAAACTTAATGTC +GTGTTATAACGAAATTACAACACTCCTTGAATTCGATAGCAATGATATCAATACAACACA +GAGGATAAATATGGTAAATAACGTAACAGATAGCTCTTTTAAAAATGAAGTACTAGAATC +GGATTTACCTGTAATGGTTGATTTTTGGGCAGAGTGGTGTGGACCATGTAAAATGTTAAT +ACCGATAATAGATGAAATCAGTAAAGAATTACAAGATAAAGTAAAAGTACTCAAAATGAA +TATTGATGAAAATCCTAAAACTCCTTCAGAATATGGTATTXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXCGTAGCATTCCAACGATAATGTTGTTTAAA +AATGGTGAACAAAAAGATACTAAAATAGGTTTGCAACAAAAAAATTCTCTTTTAGATTGG +ATTAATAAATCTATTTGATATTTATTTTATGTTACTTAACCATTCAAAAGTATTTATAGA +GATAACAGATGGTTATGTAGAAGGCATAGATGTTCATAAAAGAGCACAGGGTTTAAAGCA +TTTCTTTTTGAAAAAAGGAGTTTCTCTTTCTCCAACTATACCTATATTAAACAATATTAA +TTTTTCTTGTTATGAGGGAGAAAAAATAGCTTTTATTGGGAGTAATGGTTCAGGTAAAAG +TTCACTTCTAAAACTGATTGCTGGAATATATCCATTAAAATCAGGTATAGTAAAAGTTCA +TGGAGATATTGCTGCAATTATAGATATGGGXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXAGTCGGTTTTGAACAAGAACAGACAGGTCGTGAAAATATA 
+AAAATGCTAATGCTATATAATAATATGCTAGATAAATATAGCAAAAAAATTGAAAAAGAA +ATTATAGATTTTTCAGAACTTGGTAGTAAAATTGATTTACCGATAAAAATTTATAGTTCC +GGTATGTTATCACGCCTTGCTTTTTCTGTATCGATATTTCAGAATCCACAGATTCTATTA +CTTGATGAAGTTTTTGCAGCAGGTGATAGCTATTTTATAGAGAAATCCCTTAATTTAATG +AAGAATAAATTTAAAAATACCCCTATTTCAATAATAGTAAGTCATCAAGAAGAAATTATA +AAAGATAATTGTGATAGATGTATTTTATTAAAAGACGGTCATATTATAGATGATGGGACA +CCATCAGAAATATTTAAAATCTATAAACAACAAAGTAATAAGGAAATTCATAAATGATAA +AGTATTTTTTTTCTAAAAAATACTGGAGGGXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXX
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input.fasta Fri Apr 06 14:29:17 2018 -0400 @@ -0,0 +1,34 @@ +>contig1 +ATGACAAAGCTAATTATTCACTTGGTTTCAGACTCTTCTGTGCAAACTGCAAAACATGCAGCAAATTCTG +CTCTTGCTCAATTTACTTCTATAAAACAAAAATTGTATCATTGGCCAATGATTAGAAATTGTGAATTACT +AAATGAAGTATTAAGTAAAATAGAATCTAAACATGGAATAGTATTATACACAATTGCTGATCAAGAACTC +CGAAAAACTTTAACAAAATTTTGCTATGAATTAAAAATTCCATGTATTTCTGTAATAGGTAAAATTATTA +AAGAAATGTCTGTTTTTTCAGGTATTGAAATAGAAAAAGAACAAAATTATAATTATAAATTCGATAAAAC +TTATTTTGATACACTCAATGCTATAGATTATGCTATAAGACATGATGATGGACAAATGATTAATGAATTA +>contig2 +TCAGAATCTGATATAATATTAATAGGTCCTTCTAGAACTTCTAAAACACCGACTTCCGTATTTTTAGCGT +ATAATGGTTTAAAAGCTGCAAATATTCCTTATGTTTATAATTGTCCATTTCCTGATTTTATAGAAAAGGA +TATAGATCAATTAGTAGTAGGACTTGTTATTAATCCAAATAGGTTAATTGAGATAAGAGAAGCTAGATTA +AATTTATTGCAAATTAATGAAAATAAAAGCTATACAGATTTTAATATAGTACAAAGAGAGTGCATAGAAG +TCAGAAAAATTTGTAATCAAAGAAATTGGCCAGTGATTGATGTATCAACCAGATCAATAGAGGAAACAGC +AGCTTTAATAATGCGAATATATTATAATAGAAAAAATAAATATCATAAATAAAAAGATTTTTCATTATTT +ACAAGTAGAAGTGACTAATTTATAATTTTATTTATTGCTTTTCGTTGTTATGAGTTAAAAACTTAATGTC +GTGTTATAACGAAATTACAACACTCCTTGAATTCGATAGCAATGATATCAATACAACACAGAGGATAAAT +ATGGTAAATAACGTAACAGATAGCTCTTTTAAAAATGAAGTACTAGAATCGGATTTACCTGTAATGGTTG +ATTTTTGGGCAGAGTGGTGTGGACCATGTAAAATGTTAATACCGATAATAGATGAAATCAGTAAAGAATT +ACAAGATAAAGTAAAAGTACTCAAAATGAATATTGATGAAAATCCTAAAACTCCTTCAGAATATGGTATT +>contig3 +CGTAGCATTCCAACGATAATGTTGTTTAAAAATGGTGAACAAAAAGATACTAAAATAGGTTTGCAACAAA +AAAATTCTCTTTTAGATTGGATTAATAAATCTATTTGATATTTATTTTATGTTACTTAACCATTCAAAAG +TATTTATAGAGATAACAGATGGTTATGTAGAAGGCATAGATGTTCATAAAAGAGCACAGGGTTTAAAGCA +TTTCTTTTTGAAAAAAGGAGTTTCTCTTTCTCCAACTATACCTATATTAAACAATATTAATTTTTCTTGT +TATGAGGGAGAAAAAATAGCTTTTATTGGGAGTAATGGTTCAGGTAAAAGTTCACTTCTAAAACTGATTG +CTGGAATATATCCATTAAAATCAGGTATAGTAAAAGTTCATGGAGATATTGCTGCAATTATAGATATGGG +>contig4 +AGTCGGTTTTGAACAAGAACAGACAGGTCGTGAAAATATAAAAATGCTAATGCTATATAATAATATGCTA +GATAAATATAGCAAAAAAATTGAAAAAGAAATTATAGATTTTTCAGAACTTGGTAGTAAAATTGATTTAC 
+CGATAAAAATTTATAGTTCCGGTATGTTATCACGCCTTGCTTTTTCTGTATCGATATTTCAGAATCCACA +GATTCTATTACTTGATGAAGTTTTTGCAGCAGGTGATAGCTATTTTATAGAGAAATCCCTTAATTTAATG +AAGAATAAATTTAAAAATACCCCTATTTCAATAATAGTAAGTCATCAAGAAGAAATTATAAAAGATAATT +GTGATAGATGTATTTTATTAAAAGACGGTCATATTATAGATGATGGGACACCATCAGAAATATTTAAAAT +CTATAAACAACAAAGTAATAAGGAAATTCATAAATGATAAAGTATTTTTTTTCTAAAAAATACTGGAGGG \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.fasta Fri Apr 06 14:29:17 2018 -0400 @@ -0,0 +1,39 @@ +>pseudogenome +ATGACAAAGCTAATTATTCACTTGGTTTCAGACTCTTCTGTGCAAACTGCAAAACATGCA +GCAAATTCTGCTCTTGCTCAATTTACTTCTATAAAACAAAAATTGTATCATTGGCCAATG +ATTAGAAATTGTGAATTACTAAATGAAGTATTAAGTAAAATAGAATCTAAACATGGAATA +GTATTATACACAATTGCTGATCAAGAACTCCGAAAAACTTTAACAAAATTTTGCTATGAA +TTAAAAATTCCATGTATTTCTGTAATAGGTAAAATTATTAAAGAAATGTCTGTTTTTTCA +GGTATTGAAATAGAAAAAGAACAAAATTATAATTATAAATTCGATAAAACTTATTTTGAT +ACACTCAATGCTATAGATTATGCTATAAGACATGATGATGGACAAATGATTAATGAATTA +NNNNNCACACACTTAATTAATTAAGTGTGTGNNNNNTCAGAATCTGATATAATATTAATA +GGTCCTTCTAGAACTTCTAAAACACCGACTTCCGTATTTTTAGCGTATAATGGTTTAAAA +GCTGCAAATATTCCTTATGTTTATAATTGTCCATTTCCTGATTTTATAGAAAAGGATATA +GATCAATTAGTAGTAGGACTTGTTATTAATCCAAATAGGTTAATTGAGATAAGAGAAGCT +AGATTAAATTTATTGCAAATTAATGAAAATAAAAGCTATACAGATTTTAATATAGTACAA +AGAGAGTGCATAGAAGTCAGAAAAATTTGTAATCAAAGAAATTGGCCAGTGATTGATGTA +TCAACCAGATCAATAGAGGAAACAGCAGCTTTAATAATGCGAATATATTATAATAGAAAA +AATAAATATCATAAATAAAAAGATTTTTCATTATTTACAAGTAGAAGTGACTAATTTATA +ATTTTATTTATTGCTTTTCGTTGTTATGAGTTAAAAACTTAATGTCGTGTTATAACGAAA +TTACAACACTCCTTGAATTCGATAGCAATGATATCAATACAACACAGAGGATAAATATGG +TAAATAACGTAACAGATAGCTCTTTTAAAAATGAAGTACTAGAATCGGATTTACCTGTAA +TGGTTGATTTTTGGGCAGAGTGGTGTGGACCATGTAAAATGTTAATACCGATAATAGATG +AAATCAGTAAAGAATTACAAGATAAAGTAAAAGTACTCAAAATGAATATTGATGAAAATC +CTAAAACTCCTTCAGAATATGGTATTNNNNNCACACACTTAATTAATTAAGTGTGTGNNN +NNCGTAGCATTCCAACGATAATGTTGTTTAAAAATGGTGAACAAAAAGATACTAAAATAG +GTTTGCAACAAAAAAATTCTCTTTTAGATTGGATTAATAAATCTATTTGATATTTATTTT +ATGTTACTTAACCATTCAAAAGTATTTATAGAGATAACAGATGGTTATGTAGAAGGCATA +GATGTTCATAAAAGAGCACAGGGTTTAAAGCATTTCTTTTTGAAAAAAGGAGTTTCTCTT +TCTCCAACTATACCTATATTAAACAATATTAATTTTTCTTGTTATGAGGGAGAAAAAATA +GCTTTTATTGGGAGTAATGGTTCAGGTAAAAGTTCACTTCTAAAACTGATTGCTGGAATA +TATCCATTAAAATCAGGTATAGTAAAAGTTCATGGAGATATTGCTGCAATTATAGATATG +GGNNNNNCACACACTTAATTAATTAAGTGTGTGNNNNNAGTCGGTTTTGAACAAGAACAG +ACAGGTCGTGAAAATATAAAAATGCTAATGCTATATAATAATATGCTAGATAAATATAGC 
+AAAAAAATTGAAAAAGAAATTATAGATTTTTCAGAACTTGGTAGTAAAATTGATTTACCG +ATAAAAATTTATAGTTCCGGTATGTTATCACGCCTTGCTTTTTCTGTATCGATATTTCAG +AATCCACAGATTCTATTACTTGATGAAGTTTTTGCAGCAGGTGATAGCTATTTTATAGAG +AAATCCCTTAATTTAATGAAGAATAAATTTAAAAATACCCCTATTTCAATAATAGTAAGT +CATCAAGAAGAAATTATAAAAGATAATTGTGATAGATGTATTTTATTAAAAGACGGTCAT +ATTATAGATGATGGGACACCATCAGAAATATTTAAAATCTATAAACAACAAAGTAATAAG +GAAATTCATAAATGATAAAGTATTTTTTTTCTAAAAAATACTGGAGGGNNNNNCACACAC +TTAATTAATTAAGTGTGTGNNNNN