2
|
1 #! /usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4 use FileHandle;
|
|
5 use Bio::SeqIO;
|
|
6 #use Statistics::Descriptive;
|
|
7
|
|
8 #####
|
|
9 # Program to count all occurences of a particular REGEX
|
|
10 # in a file containing mutiple FASTA sequences.
|
|
11 # 11 September 2003. Ian Donaldson.
|
|
12 # Revised to convert IUPAC to regex
|
|
13 # Revised to read a multiple FASTA file
|
|
14 # was CountRegexGFF_IUPAC_1input_simple_output.pl
|
|
15 #####
|
|
16
|
|
17 #### File handles
|
|
18 my $input = new FileHandle;
|
|
19 my $output = new FileHandle;
|
|
20
|
|
21 #### Variables
|
|
22 my $file_number = 0;
|
|
23 my $count_fwd_regex = 0;
|
|
24 my $count_rvs_regex = 0;
|
|
25 my $count_all_regex = 0;
|
|
26 my $seq_tally = 0;
|
|
27 my @seq_totals = ();
|
|
28
|
|
29 #### Command line usage
|
|
30 if(@ARGV != 4) {
|
|
31 die ("USAGE:
|
|
32 $0
|
|
33 IUPAC
|
|
34 Multiple FASTA input file
|
|
35 Output
|
|
36 Skip palindromic (0=F+R-default|1=F only)\n\n");
|
|
37 }
|
|
38
|
|
39 #### Search forward strand only?
|
|
40 my $skip = $ARGV[3];
|
|
41 unless($skip =~ /^[01]$/) {
|
|
42 die("Only accept 0 or 1 for Skip!\n");
|
|
43 }
|
|
44
|
|
45 #### Process IUPAC string
|
|
46 my $iupac = $ARGV[0];
|
|
47 chomp $iupac;
|
|
48 $iupac = uc($iupac);
|
|
49
|
|
50 if($iupac !~ /^[ACGTRYMKWSBDHVN]+$/) {
|
|
51 die("A non-IUPAC character was detected in your input string!\n");
|
|
52 }
|
|
53
|
|
54 #### Forward strand IUPAC
|
|
55 my @fwd_iupac_letters = split(//, $iupac);
|
|
56 my @fwd_regex_list = ();
|
|
57
|
|
58 foreach my $letter (@fwd_iupac_letters) {
|
|
59 my $converted_iupac = iupac2regex($letter);
|
|
60 push(@fwd_regex_list, $converted_iupac);
|
|
61 }
|
|
62
|
|
63 my $fwd_regex = join('', @fwd_regex_list);
|
|
64
|
|
65
|
|
66 #### Reverse strand IUPAC
|
|
67 my $revcomp_iupac = RevCompIUPAC($iupac);
|
|
68 my @rev_iupac_letters = split(//, $revcomp_iupac);
|
|
69 my @rev_regex_list = ();
|
|
70
|
|
71 foreach my $letter (@rev_iupac_letters) {
|
|
72 my $converted_iupac = iupac2regex($letter);
|
|
73 push(@rev_regex_list, $converted_iupac);
|
|
74 }
|
|
75
|
|
76 my $rvs_regex = join('', @rev_regex_list);
|
|
77
|
|
78 #### Other variables
|
|
79 #my $label = $ARGV[3];
|
|
80 #
|
|
81 #if($label !~ /^[\w\d]+$/) {
|
|
82 # die("A non-letter/number character was detected in your label string!\n");
|
|
83 #}
|
|
84
|
|
85 my $length = length($iupac);
|
|
86
|
|
87 #### Open output file
|
|
88 $output->open(">$ARGV[2]") or die "Could not open output file $ARGV[2]!\n";
|
|
89 #$output->print("##gff-version 2\n");
|
|
90
|
|
91 #if($skip == 0) {
|
|
92 # $output->print("##Pattern search: $iupac and $revcomp_iupac\n");
|
|
93 #}
|
|
94
|
|
95 #else {
|
|
96 # $output->print("##Pattern search: $iupac\n");
|
|
97 #}
|
|
98
|
|
99 #### Work thru FASTA entries in the input file with SeqIO
|
|
100 my $seqio = Bio::SeqIO->new(-file => "$ARGV[1]" , '-format' => 'Fasta');
|
|
101
|
|
102 while( my $seqobj = $seqio->next_seq() ) {
|
|
103 $seq_tally++;
|
|
104 my $this_seq_tally = 0;
|
|
105 my $sequence = $seqobj->seq(); # actual sequence as a string
|
|
106 my $seq_id = $seqobj->id(); # header
|
|
107 #print(">$seq_id\n$seq\n\n");
|
|
108
|
|
109 #$output->print(">$seq_id\n");
|
|
110
|
|
111 #### Clean up $sequence to leave only nucleotides
|
|
112 #$sequence =~ s/[\s\W\d]//g;
|
|
113
|
|
114 while ($sequence =~ /($fwd_regex)/ig) {
|
|
115 $this_seq_tally++;
|
|
116 $count_fwd_regex++;
|
|
117 $count_all_regex++;
|
|
118
|
|
119 #my $end_position = pos($sequence);
|
|
120 #my $start_position = $end_position - ($length - 1);
|
|
121 #$output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t+\t.\t$label\n");
|
|
122 }
|
|
123
|
|
124 #### Count reverse REGEX
|
|
125 unless($skip == 1) {
|
|
126 while ($sequence =~ /($rvs_regex)/ig) {
|
|
127 $this_seq_tally++;
|
|
128 $count_rvs_regex++;
|
|
129 $count_all_regex++;
|
|
130
|
|
131 #my $end_position = pos($sequence);
|
|
132 #my $start_position = $end_position - ($length - 1);
|
|
133 #$output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t-\t.\t$label\n");
|
|
134 }
|
|
135 }
|
|
136
|
|
137 push(@seq_totals, $this_seq_tally);
|
|
138 $output->print("$seq_id\t$this_seq_tally\n");
|
|
139 }
|
|
140
|
|
141 #### Mean motifs per seq
|
|
142 #my $stat = Statistics::Descriptive::Full->new();
|
|
143 #$stat->add_data(@seq_totals);
|
|
144 #my $mean = $stat->mean();
|
|
145
|
|
146
|
|
147 #### Print a summary file
|
|
148 #if($skip == 0) {
|
|
149 # $output->print("##Forward: $fwd_regex. Reverse: $rvs_regex.\n",
|
|
150 # "##$count_fwd_regex on the forward strand.\n",
|
|
151 # "##$count_rvs_regex on the reverse strand.\n",
|
|
152 # "##$count_all_regex in total.\n",
|
|
153 # "##$seq_tally sequences. Mean motifs per seq = $mean\n");
|
|
154 #
|
|
155 # print STDOUT "There were $count_all_regex instances of $fwd_regex and $rvs_regex.\n\n";
|
|
156 #}
|
|
157
|
|
158 #if($skip == 1) {
|
|
159 # $output->print("##Forward: $fwd_regex.\n",
|
|
160 # "##$count_fwd_regex on the forward strand.\n",
|
|
161 # "##$seq_tally sequences. Mean motifs per seq = $mean\n");
|
|
162 #
|
|
163 # print STDOUT "There were $count_fwd_regex instances of $fwd_regex on the forward strand.\n\n";
|
|
164 #}
|
|
165
|
|
166 $output->close;
|
|
167
|
|
168 exit;
|
|
169
|
|
170 sub iupac2regex {
|
|
171 # Convert IUPAC codes to REGEX
|
|
172 my $iupac = shift;
|
|
173
|
|
174 #### Series of regexes to convert
|
|
175 if($iupac =~ /A/) { return 'A' }
|
|
176 if($iupac =~ /C/) { return 'C' }
|
|
177 if($iupac =~ /G/) { return 'G' }
|
|
178 if($iupac =~ /T/) { return 'T' }
|
|
179 if($iupac =~ /M/) { return '[AC]' }
|
|
180 if($iupac =~ /R/) { return '[AG]' }
|
|
181 if($iupac =~ /W/) { return '[AT]' }
|
|
182 if($iupac =~ /S/) { return '[CG]' }
|
|
183 if($iupac =~ /Y/) { return '[CT]' }
|
|
184 if($iupac =~ /K/) { return '[GT]' }
|
|
185 if($iupac =~ /V/) { return '[ACG]' }
|
|
186 if($iupac =~ /H/) { return '[ACT]' }
|
|
187 if($iupac =~ /D/) { return '[AGT]' }
|
|
188 if($iupac =~ /B/) { return '[CGT]' }
|
|
189 if($iupac =~ /N/) { return '[ACGT]' }
|
|
190
|
|
191 die("IUPAC not recognised by sub iupac2regex!\n");
|
|
192 }
|
|
193
|
|
194 sub RevCompIUPAC {
|
|
195 my $iupac_string = shift;
|
|
196 my @converted_list = ();
|
|
197
|
|
198 my @iupac_string_list = split(//, $iupac_string);
|
|
199
|
|
200 @iupac_string_list = reverse(@iupac_string_list);
|
|
201
|
|
202 foreach my $letter (@iupac_string_list) {
|
|
203 $letter =~ tr/ACGTRYMKWSBDHVN/TGCAYRKMWSVHDBN/;
|
|
204 push(@converted_list, $letter);
|
|
205 }
|
|
206
|
|
207 my $joined_list = join('', @converted_list);
|
|
208 return $joined_list;
|
|
209 }
|