comparison rapsodyn/ParseBlastForUniqueMatch.pl @ 0:442a7c88b886 draft

Uploaded
author mcharles
date Wed, 10 Sep 2014 09:18:15 -0400
parents
children 3f7b0788a1c4
comparison
equal deleted inserted replaced
-1:000000000000 0:442a7c88b886
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4
5
6 my $input_variant_file = $ARGV[0];
7 my $input_blast_file = $ARGV[1];
8 my $window_length = $ARGV[2];
9 my $nb_mismatch_max = $ARGV[3];
10
11 my %hash_name;
12
13 open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n");
14 while (my $line =<INB>){
15 my @fields = split (/\s+/,$line);
16 # print $#fields,"\n";
17 # print $line;
18 # exit(0);
19 my $query_name;
20 my $query_start;
21 my $query_stop;
22 my $query_aln;
23 my $subject_aln;
24 my $compt_mismatch_5p=0;
25 my $compt_mismatch_3p=0;
26
27 if ($#fields == 24){
28 $query_name = $fields[0];
29 $query_start = $fields[6];
30 $query_stop = $fields[7];
31 $query_aln = $fields[20];
32 $subject_aln = $fields[21];
33 }
34 elsif ($#fields == 4){
35 $query_name = $fields[0];
36 $query_start = $fields[1];
37 $query_stop = $fields[2];
38 $query_aln = $fields[3];
39 $subject_aln = $fields[4];
40 }
41 else {
42 print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n";
43 exit(0);
44 }
45
46
47 my @field_name = split (/\_/,$query_name);
48 my $name = $field_name[0]."_".$field_name[1];
49 if ($query_name =~ /random/){
50 $name .= "_".$field_name[2];
51 # print $name."\n";
52
53 }
54 if (!$hash_name{$name}){
55 $hash_name{$name}=0;
56 }
57 elsif ($hash_name{$name}>1){
58 next;
59 }
60 # if ($query_name eq "chrUnn_random_8279117_1_M1_a"){
61 # print "READ : $query_name\n$name\n";
62 # print "HASH : ",$hash_name{$name},"\n";
63 # # exit(0);
64 # }
65
66
67
68 chomp($query_aln);
69 chomp($subject_aln);
70
71 my $nb_gap_query=0;
72
73 if (length($query_aln) == length($subject_aln)){
74 if (length($query_aln)<$window_length-$nb_mismatch_max){
75 }
76 else {
77 my @q = split(//,$query_aln);
78 my @s = split(//,$subject_aln);
79 for (my $i=0;$i<=$#q;$i++){
80 my $global_idx = $query_start-1+$i-$nb_gap_query;
81 if ($q[$i] eq "-"){
82 if ($global_idx < $window_length){
83 $compt_mismatch_5p++;
84 }
85 elsif ($global_idx > $window_length){
86 $compt_mismatch_3p++;
87 }
88 $nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global
89 }
90 else {
91 if ($q[$i] ne $s[$i]){
92 if ($global_idx < $window_length){
93 $compt_mismatch_5p++;
94 }
95 elsif ($global_idx > $window_length){
96 $compt_mismatch_3p++;
97 }
98 }
99 }
100 }
101 $compt_mismatch_5p += $query_start-1;
102 $compt_mismatch_3p += $window_length *2 + 1 - $query_stop;
103
104 # for (my $i=0;$i<$window_length;$i++){
105 # if ($tbl_q_aln[$i] eq "#"){
106 # $compt_mismatch_5p++;
107 # }
108 # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
109 # $compt_mismatch_5p++;
110 # }
111 # else {
112 # }
113
114 # }
115 # for (my $i=$window_length+1;$i<=$window_length*2;$i++){
116 # if ($tbl_q_aln[$i] eq "#"){
117 # $compt_mismatch_3p++;
118 # }
119 # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
120 # $compt_mismatch_3p++;
121 # }
122 # else {
123 # }
124 # }
125 if (($compt_mismatch_5p <= $nb_mismatch_max)||($compt_mismatch_3p <= $nb_mismatch_max)){
126 $hash_name{$name}++;
127 }
128
129 }
130 }
131 else {
132 print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n";
133 }
134
135 # if ($line=~/chrCnn_random_49828229/){
136 # print $line;
137 # print $query_aln,"\n";
138 # print $subject_aln,"\n";
139 # print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n";
140 # print $hash_name{"chrCnn_random_49828229"},"\n";
141 # print "\n";
142 # }
143
144
145
146 }
147 # exit(0);
148
149 close (INB);
150
151 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");
152
153 while (my $ligne = <INV>) {
154
155 my @champs = split (/\s+/,$ligne);
156 my $header = $champs[0]."_".$champs[1];
157
158 if ($hash_name{$header}){
159 if ($hash_name{$header}==1){
160 print $ligne;
161 }
162 }
163 else {
164 #print STDERR "No blast result for ",$header,"\n";
165 }
166
167
168 }
169
170 close(INV);
171
172
173 # foreach my $key (sort keys %hash_name){
174 # print $key,"\t",$hash_name{$key},"\n";
175
176 # }