Mercurial > repos > pjbriggs > motif_tools
comparison Scan_IUPAC_output_matches_per_seq.pl @ 2:2f48cf393d25 draft
Add Perl scripts missing from previous upload.
author | pjbriggs |
---|---|
date | Mon, 09 Apr 2018 04:56:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:764d562755bd | 2:2f48cf393d25 |
---|---|
1 #! /usr/bin/perl | |
2 | |
3 use strict; | |
4 use FileHandle; | |
5 use Bio::SeqIO; | |
6 #use Statistics::Descriptive; | |
7 | |
8 ##### | |
9 # Program to count all occurences of a particular REGEX | |
10 # in a file containing mutiple FASTA sequences. | |
11 # 11 September 2003. Ian Donaldson. | |
12 # Revised to convert IUPAC to regex | |
13 # Revised to read a multiple FASTA file | |
14 # was CountRegexGFF_IUPAC_1input_simple_output.pl | |
15 ##### | |
16 | |
17 #### File handles | |
18 my $input = new FileHandle; | |
19 my $output = new FileHandle; | |
20 | |
21 #### Variables | |
22 my $file_number = 0; | |
23 my $count_fwd_regex = 0; | |
24 my $count_rvs_regex = 0; | |
25 my $count_all_regex = 0; | |
26 my $seq_tally = 0; | |
27 my @seq_totals = (); | |
28 | |
29 #### Command line usage | |
30 if(@ARGV != 4) { | |
31 die ("USAGE: | |
32 $0 | |
33 IUPAC | |
34 Multiple FASTA input file | |
35 Output | |
36 Skip palindromic (0=F+R-default|1=F only)\n\n"); | |
37 } | |
38 | |
39 #### Search forward strand only? | |
40 my $skip = $ARGV[3]; | |
41 unless($skip =~ /^[01]$/) { | |
42 die("Only accept 0 or 1 for Skip!\n"); | |
43 } | |
44 | |
45 #### Process IUPAC string | |
46 my $iupac = $ARGV[0]; | |
47 chomp $iupac; | |
48 $iupac = uc($iupac); | |
49 | |
50 if($iupac !~ /^[ACGTRYMKWSBDHVN]+$/) { | |
51 die("A non-IUPAC character was detected in your input string!\n"); | |
52 } | |
53 | |
54 #### Forward strand IUPAC | |
55 my @fwd_iupac_letters = split(//, $iupac); | |
56 my @fwd_regex_list = (); | |
57 | |
58 foreach my $letter (@fwd_iupac_letters) { | |
59 my $converted_iupac = iupac2regex($letter); | |
60 push(@fwd_regex_list, $converted_iupac); | |
61 } | |
62 | |
63 my $fwd_regex = join('', @fwd_regex_list); | |
64 | |
65 | |
66 #### Reverse strand IUPAC | |
67 my $revcomp_iupac = RevCompIUPAC($iupac); | |
68 my @rev_iupac_letters = split(//, $revcomp_iupac); | |
69 my @rev_regex_list = (); | |
70 | |
71 foreach my $letter (@rev_iupac_letters) { | |
72 my $converted_iupac = iupac2regex($letter); | |
73 push(@rev_regex_list, $converted_iupac); | |
74 } | |
75 | |
76 my $rvs_regex = join('', @rev_regex_list); | |
77 | |
78 #### Other variables | |
79 #my $label = $ARGV[3]; | |
80 # | |
81 #if($label !~ /^[\w\d]+$/) { | |
82 # die("A non-letter/number character was detected in your label string!\n"); | |
83 #} | |
84 | |
85 my $length = length($iupac); | |
86 | |
87 #### Open output file | |
88 $output->open(">$ARGV[2]") or die "Could not open output file $ARGV[2]!\n"; | |
89 #$output->print("##gff-version 2\n"); | |
90 | |
91 #if($skip == 0) { | |
92 # $output->print("##Pattern search: $iupac and $revcomp_iupac\n"); | |
93 #} | |
94 | |
95 #else { | |
96 # $output->print("##Pattern search: $iupac\n"); | |
97 #} | |
98 | |
99 #### Work thru FASTA entries in the input file with SeqIO | |
100 my $seqio = Bio::SeqIO->new(-file => "$ARGV[1]" , '-format' => 'Fasta'); | |
101 | |
102 while( my $seqobj = $seqio->next_seq() ) { | |
103 $seq_tally++; | |
104 my $this_seq_tally = 0; | |
105 my $sequence = $seqobj->seq(); # actual sequence as a string | |
106 my $seq_id = $seqobj->id(); # header | |
107 #print(">$seq_id\n$seq\n\n"); | |
108 | |
109 #$output->print(">$seq_id\n"); | |
110 | |
111 #### Clean up $sequence to leave only nucleotides | |
112 #$sequence =~ s/[\s\W\d]//g; | |
113 | |
114 while ($sequence =~ /($fwd_regex)/ig) { | |
115 $this_seq_tally++; | |
116 $count_fwd_regex++; | |
117 $count_all_regex++; | |
118 | |
119 #my $end_position = pos($sequence); | |
120 #my $start_position = $end_position - ($length - 1); | |
121 #$output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t+\t.\t$label\n"); | |
122 } | |
123 | |
124 #### Count reverse REGEX | |
125 unless($skip == 1) { | |
126 while ($sequence =~ /($rvs_regex)/ig) { | |
127 $this_seq_tally++; | |
128 $count_rvs_regex++; | |
129 $count_all_regex++; | |
130 | |
131 #my $end_position = pos($sequence); | |
132 #my $start_position = $end_position - ($length - 1); | |
133 #$output->print("$seq_id\tRegexSearch\tCNS\t$start_position\t$end_position\t.\t-\t.\t$label\n"); | |
134 } | |
135 } | |
136 | |
137 push(@seq_totals, $this_seq_tally); | |
138 $output->print("$seq_id\t$this_seq_tally\n"); | |
139 } | |
140 | |
141 #### Mean motifs per seq | |
142 #my $stat = Statistics::Descriptive::Full->new(); | |
143 #$stat->add_data(@seq_totals); | |
144 #my $mean = $stat->mean(); | |
145 | |
146 | |
147 #### Print a summary file | |
148 #if($skip == 0) { | |
149 # $output->print("##Forward: $fwd_regex. Reverse: $rvs_regex.\n", | |
150 # "##$count_fwd_regex on the forward strand.\n", | |
151 # "##$count_rvs_regex on the reverse strand.\n", | |
152 # "##$count_all_regex in total.\n", | |
153 # "##$seq_tally sequences. Mean motifs per seq = $mean\n"); | |
154 # | |
155 # print STDOUT "There were $count_all_regex instances of $fwd_regex and $rvs_regex.\n\n"; | |
156 #} | |
157 | |
158 #if($skip == 1) { | |
159 # $output->print("##Forward: $fwd_regex.\n", | |
160 # "##$count_fwd_regex on the forward strand.\n", | |
161 # "##$seq_tally sequences. Mean motifs per seq = $mean\n"); | |
162 # | |
163 # print STDOUT "There were $count_fwd_regex instances of $fwd_regex on the forward strand.\n\n"; | |
164 #} | |
165 | |
166 $output->close; | |
167 | |
168 exit; | |
169 | |
170 sub iupac2regex { | |
171 # Convert IUPAC codes to REGEX | |
172 my $iupac = shift; | |
173 | |
174 #### Series of regexes to convert | |
175 if($iupac =~ /A/) { return 'A' } | |
176 if($iupac =~ /C/) { return 'C' } | |
177 if($iupac =~ /G/) { return 'G' } | |
178 if($iupac =~ /T/) { return 'T' } | |
179 if($iupac =~ /M/) { return '[AC]' } | |
180 if($iupac =~ /R/) { return '[AG]' } | |
181 if($iupac =~ /W/) { return '[AT]' } | |
182 if($iupac =~ /S/) { return '[CG]' } | |
183 if($iupac =~ /Y/) { return '[CT]' } | |
184 if($iupac =~ /K/) { return '[GT]' } | |
185 if($iupac =~ /V/) { return '[ACG]' } | |
186 if($iupac =~ /H/) { return '[ACT]' } | |
187 if($iupac =~ /D/) { return '[AGT]' } | |
188 if($iupac =~ /B/) { return '[CGT]' } | |
189 if($iupac =~ /N/) { return '[ACGT]' } | |
190 | |
191 die("IUPAC not recognised by sub iupac2regex!\n"); | |
192 } | |
193 | |
194 sub RevCompIUPAC { | |
195 my $iupac_string = shift; | |
196 my @converted_list = (); | |
197 | |
198 my @iupac_string_list = split(//, $iupac_string); | |
199 | |
200 @iupac_string_list = reverse(@iupac_string_list); | |
201 | |
202 foreach my $letter (@iupac_string_list) { | |
203 $letter =~ tr/ACGTRYMKWSBDHVN/TGCAYRKMWSVHDBN/; | |
204 push(@converted_list, $letter); | |
205 } | |
206 | |
207 my $joined_list = join('', @converted_list); | |
208 return $joined_list; | |
209 } |