Mercurial > repos > dereeper > roary_plots
view Roary/lib/Bio/Roary/SortFasta.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line source
package Bio::Roary::SortFasta; # ABSTRACT: sort a fasta file by name =head1 SYNOPSIS sort a fasta file by name use Bio::Roary::SortFasta; my $obj = Bio::Roary::SortFasta->new( input_filename => 'infasta.fa', ); $obj->sort_fasta->replace_input_with_output_file; =cut use Moose; use File::Copy; use Bio::SeqIO; has 'input_filename' => ( is => 'ro', isa => 'Str', required => 1 ); has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' ); has 'make_multiple_of_three' => ( is => 'ro', isa => 'Bool', default => 0 ); has 'remove_nnn_from_end' => ( is => 'ro', isa => 'Bool', default => 0 ); has 'similarity' => ( is => 'rw', isa => 'Num', default => 1 ); has 'sequences_unaligned' => ( is => 'rw', isa => 'Bool', default => 0 ); has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' ); has '_output_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio' ); sub _build_output_filename { my ($self) = @_; return $self->input_filename . ".sorted.fa"; } sub _build__input_seqio { my ($self) = @_; return Bio::SeqIO->new( -file => $self->input_filename, -format => 'Fasta' ); } sub _build__output_seqio { my ($self) = @_; return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' ); } sub _add_padding_to_make_sequence_length_multiple_of_three { my ( $self, $input_seq ) = @_; my $seq_length = $input_seq->length(); if ( $seq_length % 3 == 1 ) { $input_seq->seq( $input_seq->seq() . "NN" ); } elsif ( $seq_length % 3 == 2 ) { $input_seq->seq( $input_seq->seq() . "N" ); } return $input_seq; } sub _remove_nnn_from_all_sequences { my ( $self, $input_sequences ) = @_; for my $sequence_name ( sort keys %{$input_sequences} ) { my $sequence = $input_sequences->{$sequence_name}->seq(); $sequence =~ s/NNN$//i; $input_sequences->{$sequence_name}->seq($sequence); } return $input_sequences; } sub sort_fasta { my ($self) = @_; my %input_sequences; my $nnn_at_end_of_all_sequences = 1; my $sequence; my $variation_detected = 0; while ( my $input_seq = $self->_input_seqio->next_seq() ) { $sequence = $input_seq->seq if(!defined($sequence)); $self->_add_padding_to_make_sequence_length_multiple_of_three($input_seq) if ( $self->make_multiple_of_three ); $nnn_at_end_of_all_sequences = 0 if ( $nnn_at_end_of_all_sequences == 1 && !( $input_seq->seq() =~ /NNN$/i ) ); $input_sequences{ $input_seq->display_id } = $input_seq; my $factor = $self->_percentage_similarity($sequence, $input_seq->seq); if($factor < $self->similarity) { $self->similarity($factor); } } $self->_remove_nnn_from_all_sequences( \%input_sequences ) if ( $self->remove_nnn_from_end && $nnn_at_end_of_all_sequences ); my $sequence_length = 0; my $sequences_unaligned = 0; for my $sequence_name ( sort keys %input_sequences ) { $sequence_length = $input_sequences{$sequence_name}->length if($sequence_length == 0); $self->sequences_unaligned(1) if($input_sequences{$sequence_name}->length != $sequence_length); $self->_output_seqio->write_seq( $input_sequences{$sequence_name} ); } return $self; } sub replace_input_with_output_file { my ($self) = @_; move( $self->output_filename, $self->input_filename ); return $self; } sub _percentage_similarity { my ($self, $string1, $string2) = @_; my $num_differences = 0; my $string1_length = length($string1); for(my $i = 0; $i < $string1_length && $i< length($string2); $i++) { $num_differences++ if( substr($string1, $i, 1) ne substr($string2, $i, 1)); } return 1 if($num_differences == 0); return 0 if($string1_length == 0); return (1.0 - ($num_differences/$string1_length)); } no Moose; __PACKAGE__->meta->make_immutable; 1;