annotate PGAP-1.2.1/blast_parser.pl @ 2:041355b97140 draft

Uploaded
author dereeper
date Thu, 24 Jun 2021 15:01:02 +0000
parents 83e62a1aeeeb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
2 use strict;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
3 use warnings;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
4 #use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64';
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
5 #use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64/lib64/perl5/site_perl/5.8.8/x86_64-linux-thread-multi';
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
6 use XML::Parser;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
7
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
8 ##############################################################################################
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
9 #
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
10 # This parser parses output files from blastall (blastp) and outputs one-line descriptions
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
11 # for each hit with a score above or equal to a cut-off score that is specified as a parameter
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
12 # to the program. The one-line description contains the following ifomation separated by tab
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
13 # * Query id
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
14 # * Hit id
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
15 # * Bit score
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
16 # * Query length
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
17 # * Hit length
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
18 # * Length of longest segment on query. This is defined as the length of the segment from the
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
19 # first position od the first hsp to the last position of the last hsp. I.e. if the
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
20 # hsps are 1-10 and 90-100, the length of the longest segment will be 100.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
21 # * Length of longest segment on hit, see explanation above.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
22 # * Total match length on query. This is defined as the total length of all hsps. If the hsps
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
23 # are 1-10 and 90-100, the length will be 20.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
24 # * Total match length on hit. see explanation above
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
25 # * Positions for all segments on the query and on the hit. Positions on the query is written
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
26 # as p:1-100 and positions on the hit is writteh as h:1-100. Positions for all hsps are
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
27 # specified separated nt tab. Positions for query and hut for a segment is separated by
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
28 # one whitespace.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
29 #
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
30
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
31 # MODIFICATION: also add %identity as next output field
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
32 # HOWEVER note this is specifically protein sequence identity of the aligned regions, not all of the proteins
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
33
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
34 # If alignment mode is used, the parser prints a > before all data on the one-line description.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
35 # The next two lines will contain the aligned sequences, including gaps. Alignment mode is off
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
36 # by default but can be used if the second parameter sent to the parser is -a. If alignment
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
37 # mode is not used only the score cutoff and the Blast XML file should be sent to the parser.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
38 #
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
39 # NB: This parser was created for blast 2.2.16. If you run earlier versions of blast or
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
40 # blast with -VT to emulate the old behaviour the code in this script must be changed a bit.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
41 # The neccesary changes are few and the first change is to un-comment all lines
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
42 # marked by ##. Lines marked with ## occurs here at the beginning of the script and on a couple
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
43 # of places in the My::TagHandler package (In the Init subroutine). The other change neccessary
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
44 # is to comment/remove all lines marked with #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
45 #
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
46 # Written by Isabella Pekkari 2007-11-19
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
47 # Modified by Kristoffer Forslund 2008-05-07 to add biased composition handling
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
48 #
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
49 ##############################################################################################
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
50
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
51 # First argument is score cutt-of in bits
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
52 # Second argument is beta threshold for composition filtering. Set to 0.0 to disable.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
53 # Third argument is blast result file (.xml, i.i run blast with option -m7) that shall be parsed
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
54 my $score_cutoff = shift;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
55
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
56
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
57 # If alignment mode shall be used, the flag -a is specified as the first parameter
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
58 # Alignment mode is off by defalut so the flag -a must be specified if this mode shall be used.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
59 my $alignment_mode = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
60 if (defined $ARGV[0] && $ARGV[0] eq '-a') {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
61
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
62 $alignment_mode = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
63 shift;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
64 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
65
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
66 # If segments must be ordered in a linear fashion on both sequences.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
67 # If 1, the hsp order must be the same on the query and the hit.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
68 # For example, if the n-terminal of the hit matches the c-terminal of
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
69 # the query another segment where the c-terminal of the hit matches
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
70 # the n-terminal of the hit si not allowed.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
71 # If this parameter is set to 0, hsps can be ordered differently on the
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
72 # query and the hit. In both cases, an overlap of maximum 5% of the shortest segment
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
73 # is allowed.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
74 my $linear_segment_mode = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
75
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
76 # Only for older versions of blast
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
77 ##my $done = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
78
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
79
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
80
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
81 # Create an xml parser
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
82 my $p = XML::Parser->new( Style => 'My::TagHandler', );
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
83
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
84 # Create an Expat::NB parser
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
85 my $nb_p = $p->parse_start();
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
86
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
87 # Parse the xml file
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
88 parse_document();
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
89
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
90 sub parse_document {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
91
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
92 while(my $l = <>){
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
93 ##if ($done) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
94
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
95 ##$nb_p->parse_done;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
96 ##$nb_p = $p->parse_start();
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
97 ##$done = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
98
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
99 ##} else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
100 # Remove whitespace at the end of the line
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
101 chomp($l);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
102 ##/DEBUG
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
103 # print STDERR "###" .$l . "###\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
104 # Parse the line
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
105 $nb_p->parse_more($l);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
106 ##}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
107 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
108
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
109 # Shut the parser down
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
110 $nb_p->parse_done;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
111
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
112 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
113
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
114 # Only used for older versions of blast
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
115 ##sub next_doc {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
116
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
117 ## $done = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
118
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
119 ##}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
120
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
121
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
122
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
123 ############################################
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
124 # Package for parsing xml files obtained
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
125 # from blast.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
126 ############################################
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
127 package My::TagHandler;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
128
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
129 use strict;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
130 use warnings;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
131
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
132 #Subroutines
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
133 my $create_query;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
134 my $store_query_length;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
135 my $new_hit;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
136 my $save_hit_length;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
137 my $new_hsp;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
138 my $save_hsp_start_query;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
139 my $save_hsp_end_query;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
140 my $save_hsp_start_hit;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
141 my $save_hsp_end_hit;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
142 my $save_hsp_qseq;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
143 my $save_hsp_hseq;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
144 my $end_hit;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
145 my $print_query;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
146 my $check_if_overlap_non_linear_allowed;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
147 my $check_if_overlap_linear;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
148
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
149 my @current_query; # The current query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
150 my $current_hit; # The current hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
151 my $current_hsp; # The current hsp
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
152
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
153 my $currentText = ''; # Text from the current tag
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
154
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
155 # Names of tags that shall be parsed as key and references to
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
156 # the anonymous subroutine that shall be run when parsing the tag as value.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
157 my %tags_to_parse;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
158
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
159 # Reference to the anonymous subroutine that shall be run
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
160 # to check whether there is an overlap between two hsps.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
161 my $overlap_sub;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
162
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
163 sub Init {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
164
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
165 my($expat) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
166
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
167 #Subroutine for creating new query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
168 $create_query = sub {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
169
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
170 # The the query id. Additional information for the query is ignored.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
171 my @tmp = split(/\s+/, $currentText);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
172 @current_query = ($tmp[0], 0, [])
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
173 };
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
174
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
175 #Subroutine for storing query length
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
176 $store_query_length = sub {$current_query[1] = $currentText};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
177
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
178 #Subroutine for creating new hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
179 $new_hit = sub {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
180
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
181 # The the hit id. Additional information for the hit is ignored.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
182 my @tmp = split(/\s+/, $currentText);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
183 $current_hit = [$tmp[0], 0, 0, 0, 0, [], 0];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
184 };
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
185
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
186 #Subroutine for saving hit length
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
187 $save_hit_length = sub {$current_hit->[1] = $currentText};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
188
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
189 #Subroutine for creating new hsp
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
190 $new_hsp = sub {$current_hsp = [$currentText]};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
191
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
192 # Subroutine for saving hsp start on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
193 $save_hsp_start_query = sub {$current_hsp->[1] = $currentText};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
194
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
195 # Subroutine for saving hsp end on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
196 $save_hsp_end_query = sub {$current_hsp->[2] = $currentText};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
197
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
198 # Subroutine for saving hsp start on hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
199 $save_hsp_start_hit = sub {$current_hsp->[3] = $currentText};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
200
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
201 # Subroutine for saving hsp end on hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
202 $save_hsp_end_hit = sub {$current_hsp->[4] = $currentText;};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
203
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
204 # Subroutine for saving hsp query sequence (as in alignment)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
205 $save_hsp_qseq = sub {$current_hsp->[5] = $currentText;};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
206
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
207 # Subroutine for saving hsp hit sequence (as in alignment)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
208 $save_hsp_hseq = sub {$current_hsp->[6] = $currentText;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
209
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
210 # Check if this hsp overlaps with any of the
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
211 # existing hsps
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
212 my $overlap_found = 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
213 foreach my $existing_hsp (@{$current_hit->[5]}) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
214
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
215 if ($overlap_sub->($current_hsp, $existing_hsp)) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
216 $overlap_found = 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
217 last;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
218 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
219
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
220 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
221
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
222 # If this hsp does not overlap with any hsp it is added
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
223
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
224
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
225 unless ($overlap_found) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
226
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
227 # Add the hsp to the hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
228 push( @{$current_hit->[5]}, $current_hsp);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
229
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
230 # Increase number of hsps for the hit with one
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
231 $current_hit->[6]++;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
232
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
233 # Add the hsp score to the total score
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
234 $current_hit->[2] += $current_hsp->[0];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
235
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
236 # Add the hsp length on the query to the total hsp length on the query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
237 $current_hit->[3] += ($current_hsp->[2] - $current_hsp->[1] + 1);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
238
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
239 # Add the hsp length on the hit to the total hsp length on the hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
240 $current_hit->[4] += ($current_hsp->[4] - $current_hsp->[3] + 1);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
241 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
242
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
243 };
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
244
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
245 # Subroutine for saving hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
246 $end_hit = sub {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
247
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
248 #Add the hit to the qurrent query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
249 unless ($current_hit->[2] < $score_cutoff ) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
250 push( @{$current_query[2]}, $current_hit );
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
251 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
252 };
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
253
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
254 # Subroutine for printing all hits for a query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
255 $print_query = sub {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
256
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
257 # Sort the hits after score with hit with highest score first
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
258 my @sorted_hits = sort {$b->[2] <=> $a->[2]} @{$current_query[2]};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
259
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
260 # For all hits
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
261 foreach my $hit (@sorted_hits) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
262
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
263 if ($alignment_mode) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
264
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
265 # When alignment mode is used, self hits are not printed
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
266 # Therefore, the hit is ignored if it has the same id as the query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
267 next if ($current_query[0] eq $hit->[0]);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
268
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
269 # When alignment mode is used, a > is printed at the start of
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
270 # lines containing data, i.e. lines that do not contain sequences
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
271 print ">";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
272 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
273
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
274 print "$current_query[0]\t"; # Print query id
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
275 print "$hit->[0]\t"; # Print hit id
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
276 printf ("%.1f\t", $hit->[2]); # Print bit score
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
277 print "$current_query[1]\t"; # Print query length
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
278 print "$hit->[1]\t"; # Print hit length
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
279
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
280 if ($hit->[6] > 1) { # if more than one segment
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
281
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
282 # Sort hsps on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
283 my @hsps_sorted_on_query = sort {$a->[1] <=> $b->[1]} @{$hit->[5]};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
284
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
285 # Sort hsps on hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
286 my @hsps_sorted_on_hit = sort {$a->[3] <=> $b->[3]} @{$hit->[5]};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
287
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
288 # Get total segment length on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
289 my $segm_length_query = $hsps_sorted_on_query[@hsps_sorted_on_query - 1]->[2] - $hsps_sorted_on_query[0]->[1] + 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
290
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
291 # Get total segment length on hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
292 my $segm_length_hit = $hsps_sorted_on_hit[@hsps_sorted_on_hit - 1]->[4] - $hsps_sorted_on_hit[0]->[3] + 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
293
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
294 print "$segm_length_query\t"; # Print segment length on query (i.e lengt of the segment started by the first hsp and ended by the last hsp)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
295 print "$segm_length_hit\t"; # Print segment length on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
296 print "$hit->[3]\t$hit->[4]\t"; # Print total length of all hsps on the query and on the match, i.e length of actually matching region
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
297
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
298 # In alignment mode, the aligned sequences shall be printed on the lines following the data line
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
299 my $hsp_qseq = '';
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
300 my $hsp_hseq = '';
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
301
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
302 # Print query and hit segment positions
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
303 foreach my $segment (@hsps_sorted_on_query) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
304
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
305 print "q:$segment->[1]-$segment->[2] ";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
306 print "h:$segment->[3]-$segment->[4]\t";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
307
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
308 if ($alignment_mode) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
309
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
310 # Add the hsp sequences to the total sequence
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
311 $hsp_qseq .= $segment->[5];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
312 $hsp_hseq .= $segment->[6];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
313 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
314
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
315 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
316
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
317 print "\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
318
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
319 if ($alignment_mode) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
320
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
321 # Print sequences
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
322 print "$hsp_qseq\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
323 print "$hsp_hseq\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
324 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
325
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
326 } else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
327 # Get the only hsp that was found for this hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
328 my $segment = $hit->[5]->[0];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
329
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
330 # Get total segment length on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
331 my $segm_length_query = $segment->[2] - $segment->[1] + 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
332
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
333 # Get total segment length on hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
334 my $segm_length_hit = $segment->[4] - $segment->[3] + 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
335
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
336 print "$segm_length_query\t"; # Print segment length on query, i.e. length of this hsp on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
337 print "$segm_length_hit\t"; # Print segment length on hit, i.e. length of this hsp on hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
338
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
339 # Print total length of all hsps on the query and on the match, i.e length of actually matching region
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
340 # Sice there is only one segment, these lengths will be the same as the segment lengths printed above.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
341 print "$hit->[3]\t$hit->[4]\t";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
342
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
343 # Print segment positions
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
344
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
345 print "q:$segment->[1]-$segment->[2] ";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
346 print "h:$segment->[3]-$segment->[4]\t";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
347 print "\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
348
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
349 if ($alignment_mode) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
350
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
351 # Print sequences
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
352 print "$segment->[5]\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
353 print "$segment->[6]\n";
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
354 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
355
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
356 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
357 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
358 ##main::next_doc(); #NB! Un-comment for older blast versions
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
359
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
360 };
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
361
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
362 # Subroutine for checking if two hsps overlap.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
363 # When this subroutine is used, non-linear arrangements of the hsps are allowed.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
364 $check_if_overlap_non_linear_allowed = sub {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
365
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
366 my $hsp1 = shift; # One hsp
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
367 my $hsp2 = shift; # Another hsp
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
368
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
369 # Check if there is an overlap.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
370 return (_check_overlap_non_linear_allowed($hsp1->[1], $hsp1->[2], $hsp2->[1], $hsp2->[2])
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
371 || _check_overlap_non_linear_allowed($hsp1->[3], $hsp1->[4], $hsp2->[3], $hsp2->[4]));
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
372
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
373 };
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
374
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
375 # Subroutine for checking if two hsps overlap.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
376 # When this subroutine is used, non-linear arrangements of the hsps are not allowed.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
377 $check_if_overlap_linear = sub {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
378
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
379 my $hsp1 = shift; # One hsp
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
380 my $hsp2 = shift; # Another hsp
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
381
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
382 # Get start point for hsp1 on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
383 my $start1_hsp1 = $hsp1->[1];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
384
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
385 # Get start point for hsp2 on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
386 my $start1_hsp2 = $hsp2->[1];
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
387
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
388 # The hsp that comes first oon the query (N-terminal hsp)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
389 my $first_hsp;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
390
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
391 # The hsp that comes last on the query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
392 my $last_hsp;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
393
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
394 # Check which hsp is N-teminal.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
395 if ($start1_hsp1 eq $start1_hsp2) { # If the fragments start at the same position, there is an overlap (100% of shorter sequence)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
396 return 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
397 } elsif ($start1_hsp1 < $start1_hsp2) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
398
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
399 $first_hsp = $hsp1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
400 $last_hsp = $hsp2;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
401
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
402 } else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
403
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
404 $first_hsp = $hsp2;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
405 $last_hsp = $hsp1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
406
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
407 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
408
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
409 # Return whether there is an overlap or not.
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
410 return (_check_overlap_linear($first_hsp->[1], $first_hsp->[2], $last_hsp->[1], $last_hsp->[2])
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
411 || _check_overlap_linear($first_hsp->[3], $first_hsp->[4], $last_hsp->[3], $last_hsp->[4]));
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
412 };
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
413
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
414 %tags_to_parse = (
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
415
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
416 ##'BlastOutput_query-def' => $create_query,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
417 ##'BlastOutput_query-def' => $create_query,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
418 ## 'BlastOutput_query-def' => $create_query,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
419 ## 'BlastOutput_query-len' => $store_query_length,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
420 'Iteration_query-def' => $create_query, # Query id #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
421 'Iteration_query-len' => $store_query_length, # Query length #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
422 'Hit_def' => $new_hit, # Hit id
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
423 'Hit_len' => $save_hit_length, # Hit length
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
424 'Hsp_bit-score' => $new_hsp, # Hsp bit score
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
425 'Hsp_query-from' => $save_hsp_start_query, # Start point for hsp on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
426 'Hsp_query-to' => $save_hsp_end_query, # End point for hsp on query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
427 'Hsp_hit-from' => $save_hsp_start_hit, # Start position for hsp on hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
428 'Hsp_hit-to' => $save_hsp_end_hit, # End position for hsp on hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
429 'Hsp_qseq' => $save_hsp_qseq, # Hsp query sequence (gapped as in the alignment)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
430 'Hsp_hseq' => $save_hsp_hseq, # Hsp hit sequence (gapped as in the alignment)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
431 'Hit_hsps' => $end_hit, # End of hit
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
432 ##'BlastOutput' => $print_query
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
433 'Iteration' => $print_query # End of query #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
434 );
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
435
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
436 # Set overlap subroutine to use
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
437 if ($linear_segment_mode eq 1) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
438 $overlap_sub = $check_if_overlap_linear;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
439 } else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
440 $overlap_sub = $check_if_overlap_non_linear_allowed;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
441 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
442 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
443
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
444 # Nothing is done when encountering a start tag
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
445 #sub Start
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
446 #{
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
447
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
448 #}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
449
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
450 sub End {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
451
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
452 my($expat, $tag) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
453
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
454 # If the name of the tag is in the table with names of tags to parse,
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
455 # run the corresponding subroutine
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
456 $tags_to_parse{$tag} && do { $tags_to_parse{$tag}->()};
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
457
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
458 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
459
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
460 sub Char
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
461 {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
462 my($expat, $text) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
463 # Save the tag text
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
464 $currentText = $text;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
465 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
466
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
467 sub _check_overlap_linear {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
468
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
469 my ($start1, $end1, $start2, $end2) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
470
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
471 # Length of segment 1
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
472 my $length1 = $end1 - $start1 + 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
473
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
474 # Length of segment 2
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
475 my $length2 = $end2 - $start2 + 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
476
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
477 # Get the length of the sortest of these segments
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
478 my $shortest_length = ($length1 < $length2)?$length1:$length2;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
479
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
480 # Maxumin of 5% overlap (witg regard to the shorter segment) is allowed
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
481 return (($start2 - $end1 - 1) / $shortest_length < - 0.05);
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
482 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
483
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
484
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
485
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
486 sub _check_overlap_non_linear_allowed {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
487
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
488 my ($start1, $end1, $start2, $end2) = @_;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
489
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
490 # Length of segment 1
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
491 my $length1 = $end1 - $start1 + 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
492
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
493 # Length of segment 2
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
494 my $length2 = $end2 - $start2 + 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
495
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
496 # Get the length of the sortest of these segments
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
497 my $shortest_length = ($length1 < $length2)?$length1:$length2;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
498
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
499 if ($start1 eq $start2) { # If the fragment start at the same position, there is an overlap (100% of shorter sequence)
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
500 return 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
501 } elsif ($start1 < $start2) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
502
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
503 if (($start2 - $end1 + 1) / $shortest_length < - 0.05) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
504 return 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
505 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
506
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
507 } else {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
508
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
509 if (($start1 - $end2 + 1) / $shortest_length < - 0.05) {
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
510 return 1;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
511 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
512
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
513 }
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
514
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
515 # If the method has not returned yet, there is no overlap
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
516 return 0;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
517 }