| 2 | 1 #! /usr/bin/perl | 
|  | 2 | 
|  | 3 use strict; | 
|  | 4 use FileHandle; | 
|  | 5 use Bio::SeqIO; | 
|  | 6 #use Statistics::Descriptive; | 
|  | 7 | 
|  | 8 ##### | 
|  | 9 # Program to count all occurences of a particular REGEX | 
|  | 10 # in a file containing mutiple FASTA sequences. | 
|  | 11 # 11 September 2003. Ian Donaldson. | 
|  | 12 # Revised to convert IUPAC to regex | 
|  | 13 # Revised to read a multiple FASTA file | 
|  | 14 # was CountRegexGFF_IUPAC_1input_simple_output.pl | 
|  | 15 ##### | 
|  | 16 | 
|  | 17 #### File handles | 
|  | 18 my $input = new FileHandle; | 
|  | 19 my $output = new FileHandle; | 
|  | 20 | 
|  | 21 #### Variables | 
|  | 22 my $file_number = 0; | 
|  | 23 my $count_fwd_regex = 0; | 
|  | 24 my $count_rvs_regex = 0; | 
|  | 25 my $count_all_regex = 0; | 
|  | 26 my $seq_tally = 0; | 
|  | 27 my @seq_totals = (); | 
|  | 28 | 
|  | 29 #### Command line usage | 
|  | 30 if(@ARGV != 4) { | 
|  | 31    die ("USAGE: | 
|  | 32    $0 | 
|  | 33    IUPAC | 
|  | 34    Multiple FASTA input file | 
|  | 35    Output | 
|  | 36    Skip palindromic (0=F+R-default|1=F only)\n\n"); | 
|  | 37 } | 
|  | 38 | 
|  | 39 #### Search forward strand only? | 
|  | 40 my $skip = $ARGV[3]; | 
|  | 41 unless($skip =~ /^[01]$/) { | 
|  | 42    die("Only accept 0 or 1 for Skip!\n"); | 
|  | 43 } | 
|  | 44 | 
|  | 45 #### Process IUPAC string | 
|  | 46 my $iupac = $ARGV[0]; | 
|  | 47 chomp $iupac; | 
|  | 48 $iupac = uc($iupac); | 
|  | 49 | 
|  | 50 if($iupac !~ /^[ACGTRYMKWSBDHVN]+$/) { | 
|  | 51    die("A non-IUPAC character was detected in your input string!\n"); | 
|  | 52 } | 
|  | 53 | 
|  | 54 #### Forward strand IUPAC | 
|  | 55 my @fwd_iupac_letters = split(//, $iupac); | 
|  | 56 my @fwd_regex_list = (); | 
|  | 57 | 
|  | 58 foreach my $letter (@fwd_iupac_letters) { | 
|  | 59    my $converted_iupac = iupac2regex($letter); | 
|  | 60    push(@fwd_regex_list, $converted_iupac); | 
|  | 61 } | 
|  | 62 | 
|  | 63 my $fwd_regex = join('', @fwd_regex_list); | 
|  | 64 | 
|  | 65 | 
|  | 66 #### Reverse strand IUPAC | 
|  | 67 my $revcomp_iupac = RevCompIUPAC($iupac); | 
|  | 68 my @rev_iupac_letters = split(//, $revcomp_iupac); | 
|  | 69 my @rev_regex_list = (); | 
|  | 70 | 
|  | 71 foreach my $letter (@rev_iupac_letters) { | 
|  | 72    my $converted_iupac = iupac2regex($letter); | 
|  | 73    push(@rev_regex_list, $converted_iupac); | 
|  | 74 } | 
|  | 75 | 
|  | 76 my $rvs_regex = join('', @rev_regex_list); | 
|  | 77 | 
|  | 78 #### Other variables | 
|  | 79 #my $label = $ARGV[3]; | 
|  | 80 # | 
|  | 81 #if($label !~ /^[\w\d]+$/) { | 
|  | 82 #   die("A non-letter/number character was detected in your label string!\n"); | 
|  | 83 #} | 
|  | 84 | 
|  | 85 my $length = length($iupac); | 
|  | 86 | 
|  | 87 #### Open output file | 
|  | 88 $output->open(">$ARGV[2]") or die "Could not open output file $ARGV[2]!\n"; | 
|  | 89 #$output->print("##gff-version 2\n"); | 
|  | 90 | 
|  | 91 #if($skip == 0) { | 
|  | 92 #   $output->print("##Pattern search: $iupac and $revcomp_iupac\n"); | 
|  | 93 #} | 
|  | 94 | 
|  | 95 #else { | 
|  | 96 #   $output->print("##Pattern search: $iupac\n"); | 
|  | 97 #} | 
|  | 98 | 
|  | 99 #### Work thru FASTA entries in the input file with SeqIO | 
|  | 100 my $seqio = Bio::SeqIO->new(-file => "$ARGV[1]" , '-format' => 'Fasta'); | 
|  | 101 | 
|  | 102 while( my $seqobj = $seqio->next_seq() ) { | 
|  | 103    $seq_tally++; | 
|  | 104    my $this_seq_tally = 0; | 
|  | 105    my $sequence = $seqobj->seq(); # actual sequence as a string | 
|  | 106    my $seq_id = $seqobj->id(); # header | 
|  | 107    #print(">$seq_id\n$seq\n\n"); | 
|  | 108 | 
|  | 109    #$output->print(">$seq_id\n"); | 
|  | 110 | 
|  | 111    #### Clean up $sequence to leave only nucleotides | 
|  | 112    #$sequence =~ s/[\s\W\d]//g; | 
|  | 113 | 
|  | 114    while ($sequence =~ /($fwd_regex)/ig) { | 
|  | 115       $this_seq_tally++; | 
|  | 116       $count_fwd_regex++; | 
|  | 117       $count_all_regex++; | 
|  | 118 | 
|  | 119       #my $end_position = pos($sequence); | 
|  | 120       #my $start_position = $end_position - ($length - 1); | 
|  | 121       #$output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t+\t.\t$label\n"); | 
|  | 122    } | 
|  | 123 | 
|  | 124    #### Count reverse REGEX | 
|  | 125    unless($skip == 1) { | 
|  | 126       while ($sequence =~ /($rvs_regex)/ig) { | 
|  | 127          $this_seq_tally++; | 
|  | 128          $count_rvs_regex++; | 
|  | 129          $count_all_regex++; | 
|  | 130 | 
|  | 131          #my $end_position = pos($sequence); | 
|  | 132          #my $start_position = $end_position - ($length - 1); | 
|  | 133          #$output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t-\t.\t$label\n"); | 
|  | 134       } | 
|  | 135    } | 
|  | 136 | 
|  | 137    push(@seq_totals, $this_seq_tally); | 
|  | 138    $output->print("$seq_id\t$this_seq_tally\n"); | 
|  | 139 } | 
|  | 140 | 
|  | 141 #### Mean motifs per seq | 
|  | 142 #my $stat = Statistics::Descriptive::Full->new(); | 
|  | 143 #$stat->add_data(@seq_totals); | 
|  | 144 #my $mean = $stat->mean(); | 
|  | 145 | 
|  | 146 | 
|  | 147 #### Print a summary file | 
|  | 148 #if($skip == 0) { | 
|  | 149 #   $output->print("##Forward: $fwd_regex. Reverse: $rvs_regex.\n", | 
|  | 150 #                  "##$count_fwd_regex on the forward strand.\n", | 
|  | 151 #                  "##$count_rvs_regex on the reverse strand.\n", | 
|  | 152 #                  "##$count_all_regex in total.\n", | 
|  | 153 #                  "##$seq_tally sequences.  Mean motifs per seq = $mean\n"); | 
|  | 154 # | 
|  | 155 #   print STDOUT "There were $count_all_regex instances of $fwd_regex and $rvs_regex.\n\n"; | 
|  | 156 #} | 
|  | 157 | 
|  | 158 #if($skip == 1) { | 
|  | 159 #   $output->print("##Forward: $fwd_regex.\n", | 
|  | 160 #                  "##$count_fwd_regex on the forward strand.\n", | 
|  | 161 #                  "##$seq_tally sequences.  Mean motifs per seq = $mean\n"); | 
|  | 162 # | 
|  | 163 #   print STDOUT "There were $count_fwd_regex instances of $fwd_regex on the forward strand.\n\n"; | 
|  | 164 #} | 
|  | 165 | 
|  | 166 $output->close; | 
|  | 167 | 
|  | 168 exit; | 
|  | 169 | 
|  | 170 sub iupac2regex { | 
|  | 171 # Convert IUPAC codes to REGEX | 
|  | 172    my $iupac = shift; | 
|  | 173 | 
|  | 174    #### Series of regexes to convert | 
|  | 175    if($iupac =~ /A/) { return 'A' } | 
|  | 176    if($iupac =~ /C/) { return 'C' } | 
|  | 177    if($iupac =~ /G/) { return 'G' } | 
|  | 178    if($iupac =~ /T/) { return 'T' } | 
|  | 179    if($iupac =~ /M/) { return '[AC]' } | 
|  | 180    if($iupac =~ /R/) { return '[AG]' } | 
|  | 181    if($iupac =~ /W/) { return '[AT]' } | 
|  | 182    if($iupac =~ /S/) { return '[CG]' } | 
|  | 183    if($iupac =~ /Y/) { return '[CT]' } | 
|  | 184    if($iupac =~ /K/) { return '[GT]' } | 
|  | 185    if($iupac =~ /V/) { return '[ACG]' } | 
|  | 186    if($iupac =~ /H/) { return '[ACT]' } | 
|  | 187    if($iupac =~ /D/) { return '[AGT]' } | 
|  | 188    if($iupac =~ /B/) { return '[CGT]' } | 
|  | 189    if($iupac =~ /N/) { return '[ACGT]' } | 
|  | 190 | 
|  | 191    die("IUPAC not recognised by sub iupac2regex!\n"); | 
|  | 192 } | 
|  | 193 | 
|  | 194 sub RevCompIUPAC { | 
|  | 195    my $iupac_string = shift; | 
|  | 196    my @converted_list = (); | 
|  | 197 | 
|  | 198    my @iupac_string_list = split(//, $iupac_string); | 
|  | 199 | 
|  | 200    @iupac_string_list = reverse(@iupac_string_list); | 
|  | 201 | 
|  | 202    foreach my $letter (@iupac_string_list) { | 
|  | 203       $letter =~ tr/ACGTRYMKWSBDHVN/TGCAYRKMWSVHDBN/; | 
|  | 204       push(@converted_list, $letter); | 
|  | 205    } | 
|  | 206 | 
|  | 207    my $joined_list = join('', @converted_list); | 
|  | 208    return $joined_list; | 
|  | 209 } |