annotate rapsodyn/ParseBlastForUniqueMatch.pl @ 10:0a6c1cfe4dc8 draft

Uploaded
author mcharles
date Mon, 19 Jan 2015 04:33:21 -0500
parents 3f7b0788a1c4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
1 #!/usr/bin/perl
10
0a6c1cfe4dc8 Uploaded
mcharles
parents: 7
diff changeset
2 #V1.1.0 manage empty files
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
3 #V1.0.1 added log, option parameters
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
4 use strict;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
5 use warnings;
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
6 use Getopt::Long;
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
7
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
8
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
9 my $input_variant_file;
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
10 my $input_blast_file;
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
11 my $log_file;
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
12 my $WINDOW_LENGTH= 50;
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
13 my $NB_MISMATCH_MAX = 3;
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
14
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
15 GetOptions (
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
16 "input_variant_file=s" => \$input_variant_file,
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
17 "input_blast_file=s" => \$input_blast_file,
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
18 "window_length=s" => \$WINDOW_LENGTH,
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
19 "log_file=s" => \$log_file,
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
20 "nb_mismatch_max=s" => \$NB_MISMATCH_MAX
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
21 ) or die("Error in command line arguments\n");
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
22
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
23 my $nb_variant_checked=0;
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
24 my $nb_variant_selected=0;
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
25 my %hash_name;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
26
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
27 open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n");
10
0a6c1cfe4dc8 Uploaded
mcharles
parents: 7
diff changeset
28 if ( -z INB){
0a6c1cfe4dc8 Uploaded
mcharles
parents: 7
diff changeset
29 exit(0);
0a6c1cfe4dc8 Uploaded
mcharles
parents: 7
diff changeset
30 }
0a6c1cfe4dc8 Uploaded
mcharles
parents: 7
diff changeset
31
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
32 while (my $line =<INB>){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
33 my @fields = split (/\s+/,$line);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
34 # print $#fields,"\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
35 # print $line;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
36 # exit(0);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
37 my $query_name;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
38 my $query_start;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
39 my $query_stop;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
40 my $query_aln;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
41 my $subject_aln;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
42 my $compt_mismatch_5p=0;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
43 my $compt_mismatch_3p=0;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
44
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
45 if ($#fields == 24){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
46 $query_name = $fields[0];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
47 $query_start = $fields[6];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
48 $query_stop = $fields[7];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
49 $query_aln = $fields[20];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
50 $subject_aln = $fields[21];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
51 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
52 elsif ($#fields == 4){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
53 $query_name = $fields[0];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
54 $query_start = $fields[1];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
55 $query_stop = $fields[2];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
56 $query_aln = $fields[3];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
57 $subject_aln = $fields[4];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
58 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
59 else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
60 print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
61 exit(0);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
62 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
63
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
64
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
65 my @field_name = split (/\_/,$query_name);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
66 my $name = $field_name[0]."_".$field_name[1];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
67 if ($query_name =~ /random/){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
68 $name .= "_".$field_name[2];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
69 # print $name."\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
70
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
71 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
72 if (!$hash_name{$name}){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
73 $hash_name{$name}=0;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
74 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
75 elsif ($hash_name{$name}>1){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
76 next;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
77 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
78 # if ($query_name eq "chrUnn_random_8279117_1_M1_a"){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
79 # print "READ : $query_name\n$name\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
80 # print "HASH : ",$hash_name{$name},"\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
81 # # exit(0);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
82 # }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
83
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
84
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
85
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
86 chomp($query_aln);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
87 chomp($subject_aln);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
88
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
89 my $nb_gap_query=0;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
90
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
91 if (length($query_aln) == length($subject_aln)){
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
92 if (length($query_aln)<$WINDOW_LENGTH-$NB_MISMATCH_MAX){
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
93 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
94 else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
95 my @q = split(//,$query_aln);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
96 my @s = split(//,$subject_aln);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
97 for (my $i=0;$i<=$#q;$i++){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
98 my $global_idx = $query_start-1+$i-$nb_gap_query;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
99 if ($q[$i] eq "-"){
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
100 if ($global_idx < $WINDOW_LENGTH){
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
101 $compt_mismatch_5p++;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
102 }
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
103 elsif ($global_idx > $WINDOW_LENGTH){
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
104 $compt_mismatch_3p++;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
105 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
106 $nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
107 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
108 else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
109 if ($q[$i] ne $s[$i]){
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
110 if ($global_idx < $WINDOW_LENGTH){
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
111 $compt_mismatch_5p++;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
112 }
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
113 elsif ($global_idx > $WINDOW_LENGTH){
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
114 $compt_mismatch_3p++;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
115 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
116 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
117 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
118 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
119 $compt_mismatch_5p += $query_start-1;
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
120 $compt_mismatch_3p += $WINDOW_LENGTH *2 + 1 - $query_stop;
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
121
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
122 # for (my $i=0;$i<$window_length;$i++){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
123 # if ($tbl_q_aln[$i] eq "#"){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
124 # $compt_mismatch_5p++;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
125 # }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
126 # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
127 # $compt_mismatch_5p++;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
128 # }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
129 # else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
130 # }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
131
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
132 # }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
133 # for (my $i=$window_length+1;$i<=$window_length*2;$i++){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
134 # if ($tbl_q_aln[$i] eq "#"){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
135 # $compt_mismatch_3p++;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
136 # }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
137 # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
138 # $compt_mismatch_3p++;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
139 # }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
140 # else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
141 # }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
142 # }
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
143 if (($compt_mismatch_5p <= $NB_MISMATCH_MAX)||($compt_mismatch_3p <= $NB_MISMATCH_MAX)){
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
144 $hash_name{$name}++;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
145 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
146
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
147 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
148 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
149 else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
150 print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
151 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
152
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
153 # if ($line=~/chrCnn_random_49828229/){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
154 # print $line;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
155 # print $query_aln,"\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
156 # print $subject_aln,"\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
157 # print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
158 # print $hash_name{"chrCnn_random_49828229"},"\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
159 # print "\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
160 # }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
161
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
162
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
163
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
164 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
165 # exit(0);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
166
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
167 close (INB);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
168
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
169 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
170
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
171 while (my $ligne = <INV>) {
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
172 $nb_variant_checked++;
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
173
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
174 my @champs = split (/\s+/,$ligne);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
175 my $header = $champs[0]."_".$champs[1];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
176
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
177 if ($hash_name{$header}){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
178 if ($hash_name{$header}==1){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
179 print $ligne;
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
180 $nb_variant_selected++;
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
181 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
182 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
183 else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
184 #print STDERR "No blast result for ",$header,"\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
185 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
186
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
187
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
188 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
189
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
190 close(INV);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
191
7
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
192 open (LF,">$log_file") or die("Can't open $log_file\n");
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
193 print LF "\n####\t Blast filtering \n";
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
194 print LF "Variant checked :\t$nb_variant_checked\n";
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
195 print LF "Variant selected :\t$nb_variant_selected\n";
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
196 close (LF);
3f7b0788a1c4 Uploaded
mcharles
parents: 0
diff changeset
197
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
198
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
199 # foreach my $key (sort keys %hash_name){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
200 # print $key,"\t",$hash_name{$key},"\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
201
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
202 # }