Mercurial > repos > dereeper > pgap
view PGAP-1.2.1/blast_parser.pl @ 2:041355b97140 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 24 Jun 2021 15:01:02 +0000 |
parents | 83e62a1aeeeb |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; #use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64'; #use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64/lib64/perl5/site_perl/5.8.8/x86_64-linux-thread-multi'; use XML::Parser; ############################################################################################## # # This parser parses output files from blastall (blastp) and outputs one-line descriptions # for each hit with a score above or equal to a cut-off score that is specified as a parameter # to the program. The one-line description contains the following ifomation separated by tab # * Query id # * Hit id # * Bit score # * Query length # * Hit length # * Length of longest segment on query. This is defined as the length of the segment from the # first position od the first hsp to the last position of the last hsp. I.e. if the # hsps are 1-10 and 90-100, the length of the longest segment will be 100. # * Length of longest segment on hit, see explanation above. # * Total match length on query. This is defined as the total length of all hsps. If the hsps # are 1-10 and 90-100, the length will be 20. # * Total match length on hit. see explanation above # * Positions for all segments on the query and on the hit. Positions on the query is written # as p:1-100 and positions on the hit is writteh as h:1-100. Positions for all hsps are # specified separated nt tab. Positions for query and hut for a segment is separated by # one whitespace. # # MODIFICATION: also add %identity as next output field # HOWEVER note this is specifically protein sequence identity of the aligned regions, not all of the proteins # If alignment mode is used, the parser prints a > before all data on the one-line description. # The next two lines will contain the aligned sequences, including gaps. Alignment mode is off # by default but can be used if the second parameter sent to the parser is -a. If alignment # mode is not used only the score cutoff and the Blast XML file should be sent to the parser. # # NB: This parser was created for blast 2.2.16. If you run earlier versions of blast or # blast with -VT to emulate the old behaviour the code in this script must be changed a bit. # The neccesary changes are few and the first change is to un-comment all lines # marked by ##. Lines marked with ## occurs here at the beginning of the script and on a couple # of places in the My::TagHandler package (In the Init subroutine). The other change neccessary # is to comment/remove all lines marked with #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST. # # Written by Isabella Pekkari 2007-11-19 # Modified by Kristoffer Forslund 2008-05-07 to add biased composition handling # ############################################################################################## # First argument is score cutt-of in bits # Second argument is beta threshold for composition filtering. Set to 0.0 to disable. # Third argument is blast result file (.xml, i.i run blast with option -m7) that shall be parsed my $score_cutoff = shift; # If alignment mode shall be used, the flag -a is specified as the first parameter # Alignment mode is off by defalut so the flag -a must be specified if this mode shall be used. my $alignment_mode = 0; if (defined $ARGV[0] && $ARGV[0] eq '-a') { $alignment_mode = 1; shift; } # If segments must be ordered in a linear fashion on both sequences. # If 1, the hsp order must be the same on the query and the hit. # For example, if the n-terminal of the hit matches the c-terminal of # the query another segment where the c-terminal of the hit matches # the n-terminal of the hit si not allowed. # If this parameter is set to 0, hsps can be ordered differently on the # query and the hit. In both cases, an overlap of maximum 5% of the shortest segment # is allowed. my $linear_segment_mode = 1; # Only for older versions of blast ##my $done = 0; # Create an xml parser my $p = XML::Parser->new( Style => 'My::TagHandler', ); # Create an Expat::NB parser my $nb_p = $p->parse_start(); # Parse the xml file parse_document(); sub parse_document { while(my $l = <>){ ##if ($done) { ##$nb_p->parse_done; ##$nb_p = $p->parse_start(); ##$done = 0; ##} else { # Remove whitespace at the end of the line chomp($l); ##/DEBUG # print STDERR "###" .$l . "###\n"; # Parse the line $nb_p->parse_more($l); ##} } # Shut the parser down $nb_p->parse_done; } # Only used for older versions of blast ##sub next_doc { ## $done = 1; ##} ############################################ # Package for parsing xml files obtained # from blast. ############################################ package My::TagHandler; use strict; use warnings; #Subroutines my $create_query; my $store_query_length; my $new_hit; my $save_hit_length; my $new_hsp; my $save_hsp_start_query; my $save_hsp_end_query; my $save_hsp_start_hit; my $save_hsp_end_hit; my $save_hsp_qseq; my $save_hsp_hseq; my $end_hit; my $print_query; my $check_if_overlap_non_linear_allowed; my $check_if_overlap_linear; my @current_query; # The current query my $current_hit; # The current hit my $current_hsp; # The current hsp my $currentText = ''; # Text from the current tag # Names of tags that shall be parsed as key and references to # the anonymous subroutine that shall be run when parsing the tag as value. my %tags_to_parse; # Reference to the anonymous subroutine that shall be run # to check whether there is an overlap between two hsps. my $overlap_sub; sub Init { my($expat) = @_; #Subroutine for creating new query $create_query = sub { # The the query id. Additional information for the query is ignored. my @tmp = split(/\s+/, $currentText); @current_query = ($tmp[0], 0, []) }; #Subroutine for storing query length $store_query_length = sub {$current_query[1] = $currentText}; #Subroutine for creating new hit $new_hit = sub { # The the hit id. Additional information for the hit is ignored. my @tmp = split(/\s+/, $currentText); $current_hit = [$tmp[0], 0, 0, 0, 0, [], 0]; }; #Subroutine for saving hit length $save_hit_length = sub {$current_hit->[1] = $currentText}; #Subroutine for creating new hsp $new_hsp = sub {$current_hsp = [$currentText]}; # Subroutine for saving hsp start on query $save_hsp_start_query = sub {$current_hsp->[1] = $currentText}; # Subroutine for saving hsp end on query $save_hsp_end_query = sub {$current_hsp->[2] = $currentText}; # Subroutine for saving hsp start on hit $save_hsp_start_hit = sub {$current_hsp->[3] = $currentText}; # Subroutine for saving hsp end on hit $save_hsp_end_hit = sub {$current_hsp->[4] = $currentText;}; # Subroutine for saving hsp query sequence (as in alignment) $save_hsp_qseq = sub {$current_hsp->[5] = $currentText;}; # Subroutine for saving hsp hit sequence (as in alignment) $save_hsp_hseq = sub {$current_hsp->[6] = $currentText; # Check if this hsp overlaps with any of the # existing hsps my $overlap_found = 0; foreach my $existing_hsp (@{$current_hit->[5]}) { if ($overlap_sub->($current_hsp, $existing_hsp)) { $overlap_found = 1; last; } } # If this hsp does not overlap with any hsp it is added unless ($overlap_found) { # Add the hsp to the hit push( @{$current_hit->[5]}, $current_hsp); # Increase number of hsps for the hit with one $current_hit->[6]++; # Add the hsp score to the total score $current_hit->[2] += $current_hsp->[0]; # Add the hsp length on the query to the total hsp length on the query $current_hit->[3] += ($current_hsp->[2] - $current_hsp->[1] + 1); # Add the hsp length on the hit to the total hsp length on the hit $current_hit->[4] += ($current_hsp->[4] - $current_hsp->[3] + 1); } }; # Subroutine for saving hit $end_hit = sub { #Add the hit to the qurrent query unless ($current_hit->[2] < $score_cutoff ) { push( @{$current_query[2]}, $current_hit ); } }; # Subroutine for printing all hits for a query $print_query = sub { # Sort the hits after score with hit with highest score first my @sorted_hits = sort {$b->[2] <=> $a->[2]} @{$current_query[2]}; # For all hits foreach my $hit (@sorted_hits) { if ($alignment_mode) { # When alignment mode is used, self hits are not printed # Therefore, the hit is ignored if it has the same id as the query next if ($current_query[0] eq $hit->[0]); # When alignment mode is used, a > is printed at the start of # lines containing data, i.e. lines that do not contain sequences print ">"; } print "$current_query[0]\t"; # Print query id print "$hit->[0]\t"; # Print hit id printf ("%.1f\t", $hit->[2]); # Print bit score print "$current_query[1]\t"; # Print query length print "$hit->[1]\t"; # Print hit length if ($hit->[6] > 1) { # if more than one segment # Sort hsps on query my @hsps_sorted_on_query = sort {$a->[1] <=> $b->[1]} @{$hit->[5]}; # Sort hsps on hit my @hsps_sorted_on_hit = sort {$a->[3] <=> $b->[3]} @{$hit->[5]}; # Get total segment length on query my $segm_length_query = $hsps_sorted_on_query[@hsps_sorted_on_query - 1]->[2] - $hsps_sorted_on_query[0]->[1] + 1; # Get total segment length on hit my $segm_length_hit = $hsps_sorted_on_hit[@hsps_sorted_on_hit - 1]->[4] - $hsps_sorted_on_hit[0]->[3] + 1; print "$segm_length_query\t"; # Print segment length on query (i.e lengt of the segment started by the first hsp and ended by the last hsp) print "$segm_length_hit\t"; # Print segment length on query print "$hit->[3]\t$hit->[4]\t"; # Print total length of all hsps on the query and on the match, i.e length of actually matching region # In alignment mode, the aligned sequences shall be printed on the lines following the data line my $hsp_qseq = ''; my $hsp_hseq = ''; # Print query and hit segment positions foreach my $segment (@hsps_sorted_on_query) { print "q:$segment->[1]-$segment->[2] "; print "h:$segment->[3]-$segment->[4]\t"; if ($alignment_mode) { # Add the hsp sequences to the total sequence $hsp_qseq .= $segment->[5]; $hsp_hseq .= $segment->[6]; } } print "\n"; if ($alignment_mode) { # Print sequences print "$hsp_qseq\n"; print "$hsp_hseq\n"; } } else { # Get the only hsp that was found for this hit my $segment = $hit->[5]->[0]; # Get total segment length on query my $segm_length_query = $segment->[2] - $segment->[1] + 1; # Get total segment length on hit my $segm_length_hit = $segment->[4] - $segment->[3] + 1; print "$segm_length_query\t"; # Print segment length on query, i.e. length of this hsp on query print "$segm_length_hit\t"; # Print segment length on hit, i.e. length of this hsp on hit # Print total length of all hsps on the query and on the match, i.e length of actually matching region # Sice there is only one segment, these lengths will be the same as the segment lengths printed above. print "$hit->[3]\t$hit->[4]\t"; # Print segment positions print "q:$segment->[1]-$segment->[2] "; print "h:$segment->[3]-$segment->[4]\t"; print "\n"; if ($alignment_mode) { # Print sequences print "$segment->[5]\n"; print "$segment->[6]\n"; } } } ##main::next_doc(); #NB! Un-comment for older blast versions }; # Subroutine for checking if two hsps overlap. # When this subroutine is used, non-linear arrangements of the hsps are allowed. $check_if_overlap_non_linear_allowed = sub { my $hsp1 = shift; # One hsp my $hsp2 = shift; # Another hsp # Check if there is an overlap. return (_check_overlap_non_linear_allowed($hsp1->[1], $hsp1->[2], $hsp2->[1], $hsp2->[2]) || _check_overlap_non_linear_allowed($hsp1->[3], $hsp1->[4], $hsp2->[3], $hsp2->[4])); }; # Subroutine for checking if two hsps overlap. # When this subroutine is used, non-linear arrangements of the hsps are not allowed. $check_if_overlap_linear = sub { my $hsp1 = shift; # One hsp my $hsp2 = shift; # Another hsp # Get start point for hsp1 on query my $start1_hsp1 = $hsp1->[1]; # Get start point for hsp2 on query my $start1_hsp2 = $hsp2->[1]; # The hsp that comes first oon the query (N-terminal hsp) my $first_hsp; # The hsp that comes last on the query my $last_hsp; # Check which hsp is N-teminal. if ($start1_hsp1 eq $start1_hsp2) { # If the fragments start at the same position, there is an overlap (100% of shorter sequence) return 1; } elsif ($start1_hsp1 < $start1_hsp2) { $first_hsp = $hsp1; $last_hsp = $hsp2; } else { $first_hsp = $hsp2; $last_hsp = $hsp1; } # Return whether there is an overlap or not. return (_check_overlap_linear($first_hsp->[1], $first_hsp->[2], $last_hsp->[1], $last_hsp->[2]) || _check_overlap_linear($first_hsp->[3], $first_hsp->[4], $last_hsp->[3], $last_hsp->[4])); }; %tags_to_parse = ( ##'BlastOutput_query-def' => $create_query, ##'BlastOutput_query-def' => $create_query, ## 'BlastOutput_query-def' => $create_query, ## 'BlastOutput_query-len' => $store_query_length, 'Iteration_query-def' => $create_query, # Query id #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST 'Iteration_query-len' => $store_query_length, # Query length #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST 'Hit_def' => $new_hit, # Hit id 'Hit_len' => $save_hit_length, # Hit length 'Hsp_bit-score' => $new_hsp, # Hsp bit score 'Hsp_query-from' => $save_hsp_start_query, # Start point for hsp on query 'Hsp_query-to' => $save_hsp_end_query, # End point for hsp on query 'Hsp_hit-from' => $save_hsp_start_hit, # Start position for hsp on hit 'Hsp_hit-to' => $save_hsp_end_hit, # End position for hsp on hit 'Hsp_qseq' => $save_hsp_qseq, # Hsp query sequence (gapped as in the alignment) 'Hsp_hseq' => $save_hsp_hseq, # Hsp hit sequence (gapped as in the alignment) 'Hit_hsps' => $end_hit, # End of hit ##'BlastOutput' => $print_query 'Iteration' => $print_query # End of query #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST ); # Set overlap subroutine to use if ($linear_segment_mode eq 1) { $overlap_sub = $check_if_overlap_linear; } else { $overlap_sub = $check_if_overlap_non_linear_allowed; } } # Nothing is done when encountering a start tag #sub Start #{ #} sub End { my($expat, $tag) = @_; # If the name of the tag is in the table with names of tags to parse, # run the corresponding subroutine $tags_to_parse{$tag} && do { $tags_to_parse{$tag}->()}; } sub Char { my($expat, $text) = @_; # Save the tag text $currentText = $text; } sub _check_overlap_linear { my ($start1, $end1, $start2, $end2) = @_; # Length of segment 1 my $length1 = $end1 - $start1 + 1; # Length of segment 2 my $length2 = $end2 - $start2 + 1; # Get the length of the sortest of these segments my $shortest_length = ($length1 < $length2)?$length1:$length2; # Maxumin of 5% overlap (witg regard to the shorter segment) is allowed return (($start2 - $end1 - 1) / $shortest_length < - 0.05); } sub _check_overlap_non_linear_allowed { my ($start1, $end1, $start2, $end2) = @_; # Length of segment 1 my $length1 = $end1 - $start1 + 1; # Length of segment 2 my $length2 = $end2 - $start2 + 1; # Get the length of the sortest of these segments my $shortest_length = ($length1 < $length2)?$length1:$length2; if ($start1 eq $start2) { # If the fragment start at the same position, there is an overlap (100% of shorter sequence) return 1; } elsif ($start1 < $start2) { if (($start2 - $end1 + 1) / $shortest_length < - 0.05) { return 1; } } else { if (($start1 - $end2 + 1) / $shortest_length < - 0.05) { return 1; } } # If the method has not returned yet, there is no overlap return 0; }