view Scan_IUPAC_output_each_match.pl @ 3:856008c4a5f3 draft default tip

Version 1.0.2 (updates bioperl to 1.7.2)
author pjbriggs
date Fri, 05 Oct 2018 05:33:31 -0400
parents 2f48cf393d25
children
line wrap: on
line source

#! /usr/bin/perl

use strict;
use FileHandle;
use Bio::SeqIO;
#use Statistics::Descriptive;

#####
# Program to count all occurences of a particular REGEX 
# in a file containing mutiple FASTA sequences.
# 11 September 2003. Ian Donaldson.
# Revised to convert IUPAC to regex
# Revised to read a multiple FASTA file
# was CountRegexGFF_IUPAC_1input_nosummary.pl
#####

#### File handles
my $input = new FileHandle;
my $output = new FileHandle;

#### Variables
my $file_number = 0;
my $count_fwd_regex = 0;
my $count_rvs_regex = 0;
my $count_all_regex = 0;
my $seq_tally = 0;
my @seq_totals = ();

#### Command line usage
if(@ARGV != 5) {
   die ("USAGE: 
   $0
   IUPAC
   Multiple FASTA input file
   Output
   Label
   Skip palindromic (0=F+R-default|1=F only)\n\n");
} 

#### Search forward strand only?
my $skip = $ARGV[4];
unless($skip =~ /^[01]$/) {
   die("Only accept 0 or 1 for Skip!\n");
}

#### Process IUPAC string
my $iupac = $ARGV[0];
chomp $iupac;
$iupac = uc($iupac);

if($iupac !~ /^[ACGTRYMKWSBDHVN]+$/) {
   die("A non-IUPAC character was detected in your input string!\n");
}

#### Forward strand IUPAC
my @fwd_iupac_letters = split(//, $iupac); 
my @fwd_regex_list = ();

foreach my $letter (@fwd_iupac_letters) {
   my $converted_iupac = iupac2regex($letter);
   push(@fwd_regex_list, $converted_iupac);
}

my $fwd_regex = join('', @fwd_regex_list);


#### Reverse strand IUPAC
my $revcomp_iupac = RevCompIUPAC($iupac);
my @rev_iupac_letters = split(//, $revcomp_iupac); 
my @rev_regex_list = ();

foreach my $letter (@rev_iupac_letters) {
   my $converted_iupac = iupac2regex($letter);
   push(@rev_regex_list, $converted_iupac);
}

my $rvs_regex = join('', @rev_regex_list);

#### Other variables
my $label = $ARGV[3]; 

if($label !~ /^[\w\d]+$/) {
   die("A non-letter/number character was detected in your label string!\n");
}

my $length = length($iupac);

#### Open output file
$output->open(">$ARGV[2]") or die "Could not open output file $ARGV[2]!\n";
#$output->print("##gff-version 2\n");

#if($skip == 0) {
#   $output->print("##Pattern search: $iupac and $revcomp_iupac\n");
#}

#else {
#   $output->print("##Pattern search: $iupac\n");
#}

#### Work thru FASTA entries in the input file with SeqIO
my $seqio = Bio::SeqIO->new(-file => "$ARGV[1]" , '-format' => 'Fasta');

while( my $seqobj = $seqio->next_seq() ) {
   $seq_tally++;
   my $this_seq_tally = 0;
   my $sequence = $seqobj->seq(); # actual sequence as a string
   my $seq_id = $seqobj->id(); # header
   #print(">$seq_id\n$seq\n\n");

   #$output->print(">$seq_id\n");

   #### Clean up $sequence to leave only nucleotides
   $sequence =~ s/[\s\W\d]//g;

   while ($sequence =~ /($fwd_regex)/ig) {
      $this_seq_tally++;
      $count_fwd_regex++; 
      $count_all_regex++;
      
      my $end_position = pos($sequence);
      my $start_position = $end_position - ($length - 1);
      $output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t+\t.\t$label\n"); 
   }

   #### Count reverse REGEX
   unless($skip == 1) {
      while ($sequence =~ /($rvs_regex)/ig) {
         $this_seq_tally++;
         $count_rvs_regex++;
         $count_all_regex++;

         my $end_position = pos($sequence);
         my $start_position = $end_position - ($length - 1);
         $output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t-\t.\t$label\n"); 
      }

   push(@seq_totals, $this_seq_tally);
   #$output->print("$this_seq_tally matches\n");
   }
}

#### Mean motifs per seq
#my $stat = Statistics::Descriptive::Full->new();
#$stat->add_data(@seq_totals); 
#my $mean = $stat->mean();


#### Print a summary file
if($skip == 0) {
#   $output->print("##Forward: $fwd_regex. Reverse: $rvs_regex.\n",
#                  "##$count_fwd_regex on the forward strand.\n",
#                  "##$count_rvs_regex on the reverse strand.\n",
#                  "##$count_all_regex in total.\n",
#                  "##$seq_tally sequences.  Mean motifs per seq = $mean\n");
#
   print STDOUT "There were $count_all_regex instances of $fwd_regex and $rvs_regex.\n"; 
}

if($skip == 1) {
#   $output->print("##Forward: $fwd_regex.\n",
#                  "##$count_fwd_regex on the forward strand.\n",
#                  "##$seq_tally sequences.  Mean motifs per seq = $mean\n");
#
   print STDOUT "There were $count_fwd_regex instances of $fwd_regex on the forward strand.\n"; 
}

$output->close;

exit;

sub iupac2regex {
# Convert IUPAC codes to REGEX
   my $iupac = shift;
   
   #### Series of regexes to convert 
   if($iupac =~ /A/) { return 'A' }
   if($iupac =~ /C/) { return 'C' }
   if($iupac =~ /G/) { return 'G' }
   if($iupac =~ /T/) { return 'T' }
   if($iupac =~ /M/) { return '[AC]' }
   if($iupac =~ /R/) { return '[AG]' }
   if($iupac =~ /W/) { return '[AT]' }
   if($iupac =~ /S/) { return '[CG]' }
   if($iupac =~ /Y/) { return '[CT]' }
   if($iupac =~ /K/) { return '[GT]' }
   if($iupac =~ /V/) { return '[ACG]' }
   if($iupac =~ /H/) { return '[ACT]' }
   if($iupac =~ /D/) { return '[AGT]' }
   if($iupac =~ /B/) { return '[CGT]' }
   if($iupac =~ /N/) { return '[ACGT]' }

   die("IUPAC not recognised by sub iupac2regex!\n");
}

sub RevCompIUPAC {
   my $iupac_string = shift;
   my @converted_list = ();

   my @iupac_string_list = split(//, $iupac_string); 

   @iupac_string_list = reverse(@iupac_string_list);

   foreach my $letter (@iupac_string_list) {
      $letter =~ tr/ACGTRYMKWSBDHVN/TGCAYRKMWSVHDBN/;
      push(@converted_list, $letter);
   }

   my $joined_list = join('', @converted_list);
   return $joined_list;
}