view pseudogenome.pl @ 0:47b586ab4729 draft default tip

planemo upload commit 4fee4519135f7677cf50f721cf1ad7a7335ad66d-dirty
author nml
date Fri, 06 Apr 2018 14:29:17 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env perl
use strict;
use warnings;
use autodie qw(:all);
use Bio::SeqIO;
use Readonly;
use File::Basename;
use Getopt::Long;
use Pod::Usage;
Getopt::Long::Configure('bundling');

=head1 NAME

nml_pseudogenome.pl - To create a single pseudo genome out of multiple contigs provided in a single fasta file. Contig are combined in order of appearances in file

=head1 SYNOPSIS

nml_pseudogenome.pl -i F<file_name.fna> -n 100 -c X -o F<filename.fna>

=head1 OPTIONS

=over

=item B<-i>, B<--input>

Multiple fasta file

=item B<-n>, B<--number>

Number of filler base pairs to be added, default : 10

=item B<-c>, B<--chars>

Character to be used as the 'glue' between contigs, default : 'N'

=item B<--id>

Name of fasta file to be used default: pseudogenome

=item B<-o>, B<--output>

Output file name, default : Same as input

=item B<-s>, B<--stitch>

Add the stitch pattern between contigs only

=item B<-h>, B<--help>

Print this help

=item EXAMPLE

nml_pseudogenome.pl -i multiple_fasta.fna -n 100 -c X -o pseudo.fna

nml_pseudogenome.pl -i another_multiple.fna

=back

=head1 DESCRIPTION

To create a single pseudo genome out of multiple contigs provided in a single fasta file. Contig are combined in order of appearances in file.

=cut

# Nonsub perlcode

Readonly my $DEFAULT_NUM_CHAR => 10;
Readonly my $stitch_pattern => 'NNNNNCACACACTTAATTAATTAAGTGTGTGNNNNN';
Readonly my $DEFAULT_CHAR => 'N';
my ( $input,$id, $number, $char, $output,$stitch, $help );

GetOptions(
    'i|input=s'    => \$input,
    'n|number=s' => \$number,
    'c|char=s'   => \$char,
    'o|output=s'   => \$output,
    'h|help'       => \$help,
    's|stitch' => \$stitch,
    'id=s' => \$id
);
($input,$id,$number,$char,$output) =check_inputs( $input, $number, $char,$output, $help,$stitch );



my $in = Bio::SeqIO->new(-file=>$input,-format=>'fasta');

my $sequence;

#go thru every sequence and append to main sequence
while (my $seq = $in->next_seq()) {
    if ($stitch) {
        $sequence .= $seq->seq . $stitch_pattern;
    }
    else {
        $sequence .= $seq->seq . ($char x $number );        
    }

}

my $main = Bio::Seq->new(-display_id=>$id,-seq=>$sequence);

my $out = Bio::SeqIO->new(-file => ">$output" ,-format=>'fasta');
$out->write_seq($main);

exit;

=begin HTML

=head2 check_inputs

     Title   : check_inputs
     Usage   : check_inputs($fasta,$num,$filler,$out_to,$usage);
     Function: check arguments provided by the user to see if they are usable and more or less correct
     Returns : Return 1 if all is correct,otherwise 0
     Args    : $query: Query that we are looking for in the database.  Could be accession number or locus_tag
               $db: Name of database we are looking for using the query provided
               $format: Ensure that format was given by user and is valid format
               $usage: If true, return usage description
     Throws  : none

=cut

sub check_inputs {
    my ( $fasta, $num, $filler, $out_to, $usage,$use_stitch ) = @_;

    if ( $help || !( $fasta || $num || $filler || $out_to ) ) {
	pod2usage();
        exit;
    }

    if ( !($fasta) || !( -e $fasta ) ) {
        print STDERR "Error: Input file not given or does not exist\n";
	pod2usage();
        exit;
    }

    if ($use_stitch) {
        print "Using stitch pattern\n";
        
    }
    else {
        if ( !$num ) {
            $num = $DEFAULT_NUM_CHAR;
            print STDERR "Number of character not given, using $num\n";
        }
        elsif ( !( $num =~ /^\d+$/xms ) ) {
            print STDERR "Error: Number of character was not a number\n";
	pod2usage();
            exit;
        }
        
        if ( !$filler ) {
            $filler = $DEFAULT_CHAR;
            print STDERR "No filler character given, using 'N'\n";
        }
        
    }

    if ( !($out_to) ) {
        $out_to = fileparse($fasta) . ".pseudogenome";
        print
          "Output file was not given. Result will be written to '$out_to'\n";
    }
    if ( ! $id) {
        $id = 'pseudogenome';
    }

    return ( $fasta,$id, $num, $filler, $out_to );
}

=end HTML

=head1 SEE ALSO

No related files.

=head1 AUTHOR

Philip Mabon, <philip.mabon@canada.ca>

=head1 BUGS

None reported.

=head1 COPYRIGHT & LICENSE 

Copyright (C) 2018 by Public Health Agency of Canada

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.2 or,
at your option, any later version of Perl 5 you may have available.

=head1 DEVELOPER PAGE

No developer documentation.

=cut