Mercurial > repos > petr-novak > repeatrxplorer
comparison lib/detect_LTR_insertion_sites.pl @ 0:1d1b9e1b2e2f draft
Uploaded
| author | petr-novak |
|---|---|
| date | Thu, 19 Dec 2019 10:24:45 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1d1b9e1b2e2f |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 # parses ACE files of assembled repeats and detects potential | |
| 4 # LTR borders/insertion sites of LTR-retroelements | |
| 5 | |
| 6 # "site" is a region (size of $window) including TG or CA | |
| 7 # "out" is a region adjacent to the site, presumably representing insertion sites | |
| 8 | |
| 9 # this is RepeatExplorer version of "detect_insertion_sites_LTRs.pl" | |
| 10 # -m default set to 10 | |
| 11 | |
| 12 use Getopt::Std; | |
| 13 | |
| 14 | |
| 15 | |
| 16 | |
| 17 getopt('iowsmdrp'); | |
| 18 if ($opt_i) { | |
| 19 $infile = $opt_i; | |
| 20 } else { | |
| 21 die "-i input_file_name missing\n"; | |
| 22 } | |
| 23 | |
| 24 if ($opt_p) { | |
| 25 $db_PBS = $opt_p; | |
| 26 } else { | |
| 27 die "-p PBS database is missing\n"; | |
| 28 } | |
| 29 | |
| 30 | |
| 31 | |
| 32 | |
| 33 if ($opt_o) { | |
| 34 $outfile = $opt_o; | |
| 35 } else { | |
| 36 die "-o output_file_name missing\n"; | |
| 37 } | |
| 38 if ($opt_w) { | |
| 39 $window = $opt_w; | |
| 40 } else { | |
| 41 $window = 7; | |
| 42 print "window size not set, using default ($window)\n"; | |
| 43 } | |
| 44 if ($opt_s) { | |
| 45 $min_site_depth = $opt_s; # minimal average read depth (over $window) required for the site | |
| 46 } else { | |
| 47 $min_site_depth = 10; | |
| 48 print "min_site_depth not set, using default ($min_site_depth)\n"; | |
| 49 } | |
| 50 if ($opt_m) { | |
| 51 $min_out_masked = $opt_m; # minimal average number of masked reads outside the site (over $window) | |
| 52 } else { | |
| 53 $min_out_masked = 10; | |
| 54 print "min_out_masked not set, using default ($min_out_masked)\n"; | |
| 55 } | |
| 56 if ($opt_d) { | |
| 57 $min_masked_fold_diff = $opt_d; # how many times should the proportion of masked reads "out" be higher than in "site" | |
| 58 } else { | |
| 59 $min_masked_fold_diff = 3; | |
| 60 print "min_masked_fold_diff not set, using default ($min_masked_fold_diff)\n"; | |
| 61 } | |
| 62 if ($opt_x) { | |
| 63 $max_char_to_masked = $opt_x; # max fold difference between depth in "site" and masked depth "out" | |
| 64 } else { | |
| 65 $max_char_to_masked = 10; | |
| 66 print "max_char_to_masked not set, using default ($max_char_to_masked)\n"; | |
| 67 } | |
| 68 if ($opt_r) { | |
| 69 $extract_region = $opt_r; | |
| 70 } else { | |
| 71 $extract_region = 30; | |
| 72 print "extract_region not set, using default ($extract_region)\n"; | |
| 73 } | |
| 74 | |
| 75 # main | |
| 76 $out_table = $outfile; | |
| 77 $out_LTR = "$outfile.LTR"; | |
| 78 $out_ADJ = "$outfile.ADJ"; | |
| 79 open (IN,$infile) or die; | |
| 80 open (OUT,">$out_table") or die; | |
| 81 open (LTR,">$out_LTR") or die; # LTR end regions as fasta seq; all are converetd to ....CA (so TG... regions are reverse-complemented) | |
| 82 open (ADJ,">$out_ADJ") or die; # regions adjacent to LTR ends; if LTR end is rev-complemented, so is its corresponding adjacent region | |
| 83 print OUT "#Parameters:\n"; | |
| 84 print OUT "#infile\t$infile\n#outfile\t$outfile\n#window\t$window\n#min_site_depth\t$min_site_depth\n"; | |
| 85 print OUT "#min_out_masked\t$min_out_masked\n#min_masked_fold_diff\t$min_masked_fold_diff\n#max_char_to_masked\t$max_char_to_masked\n#extract_region\t$extract_region\n\n"; | |
| 86 print OUT "CL\tcontig\tTG/CA\tposition\tsite\tsite_depth\tout_masked\tmasked_ratio_site\tmasked_ratio_out\tregion_in\tregion_out\tblast PBS\n"; | |
| 87 print "Analyzing ACE file...\n"; | |
| 88 $prev = 0; | |
| 89 while ($radek = <IN>) { | |
| 90 $contig_found = &read_contig; | |
| 91 if ($contig_found) { | |
| 92 if ($cl > $prev) { | |
| 93 $prev = $cl; | |
| 94 } | |
| 95 &reconstruct_assembly; | |
| 96 &find_sites; | |
| 97 } | |
| 98 } | |
| 99 close IN; | |
| 100 close OUT; | |
| 101 close LTR; | |
| 102 close ADJ; | |
| 103 print "Running blast against tRNA database...\n"; | |
| 104 &add_PBS_info; # detects similarities of sequences in ADJ to tRNA database (!!! reads ADJ and $out_table !!!) | |
| 105 | |
| 106 $error = system("rm $out_table"); | |
| 107 if ($error) { | |
| 108 print "Error removing $out_table\n"; | |
| 109 } | |
| 110 | |
| 111 sub read_contig { | |
| 112 my ($reads_found,$read_id); | |
| 113 # global variables | |
| 114 $cl = 0; | |
| 115 $contig = 0; | |
| 116 $cont_length = 0; | |
| 117 $reads = 0; # number of reads | |
| 118 $cons = ""; # contig consensus (including gaps *) | |
| 119 %read_starts = (); # starts of reads within assembly | |
| 120 %read_lengths = (); # length of reads in assembly (may contain gaps) | |
| 121 %read_from = (); # start of non-masked part of read sequence (relative to the read) | |
| 122 %read_to = (); # end of non-masked part of read sequence | |
| 123 | |
| 124 do { | |
| 125 if ($radek =~/^CO CL(\d+)Contig(\d+) (\d+) (\d+)/) { | |
| 126 $cl = $1; $contig = $2; $cont_length = $3; $reads = $4; | |
| 127 while ($radek = <IN> and length($radek) > 1) { | |
| 128 chomp($radek); | |
| 129 $cons .= $radek; | |
| 130 } | |
| 131 do { | |
| 132 if ($radek =~/^AF (\S+) [UC] ([-]?\d+)/) { | |
| 133 #print "$1 : $2\n"; | |
| 134 $read_starts{$1} = $2; | |
| 135 } | |
| 136 } while ($radek = <IN> and not $radek =~/^BS \d+ \d+/); | |
| 137 $reads_found = 0; | |
| 138 while ($reads_found < $reads) { # expects previously specified number of reads | |
| 139 $radek = <IN>; # expects RD lines with read ids alternate with QA lines | |
| 140 if ($radek =~/^RD (\S+) (\d+)/) { | |
| 141 $read_id = $1; | |
| 142 $read_lengths{$read_id} = $2; | |
| 143 } | |
| 144 if ($radek =~/^QA (\d+) (\d+)/) { | |
| 145 $read_from{$read_id} = $1; | |
| 146 $read_to{$read_id} = $2; | |
| 147 $reads_found++; | |
| 148 } | |
| 149 } | |
| 150 return 1; | |
| 151 } | |
| 152 } while ($radek = <IN>); | |
| 153 return 0; | |
| 154 } | |
| 155 | |
| 156 sub reconstruct_assembly { | |
| 157 my ($id,$min_start,$max_end,$shift,$f,$poz); | |
| 158 # global variables | |
| 159 @assembly_seq = (); # sequence at each position of assembly; it corresponds to consensus, or regions | |
| 160 # before (-) and after (+) consensus [assembly may be longer than consensus] | |
| 161 @assembly_char = (); # number of assembly positions with non-masked characters (nucleotides or gaps) | |
| 162 @assembly_masked = (); # number of masked positions | |
| 163 $assembly_length = 0; # total length, including - and + regions | |
| 164 | |
| 165 $min_start = 1; $max_end = 1; | |
| 166 foreach $id (keys(%read_starts)) { | |
| 167 if ($min_start > $read_starts{$id}) { | |
| 168 $min_start = $read_starts{$id}; | |
| 169 } | |
| 170 if ($max_end < $read_starts{$id}+$read_lengths{$id}-1) { | |
| 171 $max_end = $read_starts{$id}+$read_lengths{$id}-1; | |
| 172 } | |
| 173 } | |
| 174 if ($min_start < 1) { | |
| 175 $shift = abs($min_start) +1; | |
| 176 $assembly_length = $shift + $max_end; | |
| 177 } else { | |
| 178 $assembly_length = $max_end; | |
| 179 $shift = 0; | |
| 180 } | |
| 181 | |
| 182 for ($f=1;$f<=$shift;$f++) { | |
| 183 $assembly_seq[$f] = "-"; | |
| 184 } | |
| 185 for ($f=1;$f<=$cont_length;$f++) { | |
| 186 $assembly_seq[$f+$shift] = substr($cons,$f-1,1); | |
| 187 } | |
| 188 for ($f=$shift+$cont_length+1;$f<=$assembly_length;$f++) { | |
| 189 $assembly_seq[$f] = "+"; | |
| 190 } | |
| 191 | |
| 192 for ($f=1;$f<=$assembly_length;$f++) { | |
| 193 $assembly_char[$f] = 0; | |
| 194 $assembly_masked[$f] = 0; | |
| 195 } | |
| 196 foreach $id (keys(%read_starts)) { | |
| 197 for ($f=1;$f<=$read_lengths{$id};$f++) { | |
| 198 $poz = $read_starts{$id} + $shift + $f - 1; | |
| 199 if ($f>=$read_from{$id} and $f<=$read_to{$id}) { | |
| 200 $assembly_char[$poz]++; | |
| 201 } else { | |
| 202 $assembly_masked[$poz]++; | |
| 203 } | |
| 204 } | |
| 205 } | |
| 206 } | |
| 207 | |
| 208 sub revcompl { | |
| 209 my ($input) = @_; | |
| 210 my ($base,$seq,$f); | |
| 211 | |
| 212 $seq = ""; | |
| 213 for ($f=length($input)-1;$f>=0;$f--) { | |
| 214 $base = substr($input,$f,1); | |
| 215 if ($base eq "A") { | |
| 216 $seq .= "T"; | |
| 217 } elsif ($base eq "T") { | |
| 218 $seq .= "A"; | |
| 219 } elsif ($base eq "G") { | |
| 220 $seq .= "C"; | |
| 221 } elsif ($base eq "C") { | |
| 222 $seq .= "G"; | |
| 223 } elsif ($base eq "+" or $base eq "-") { | |
| 224 $seq .= $base; | |
| 225 } else { | |
| 226 $seq .= "N"; | |
| 227 } | |
| 228 } | |
| 229 return $seq; | |
| 230 } | |
| 231 | |
| 232 sub find_sites { | |
| 233 my ($f,$site_sum_char,$site_sum_masked,$out_sum_char,$site_seq,$out_sum_masked,@TG,@CA,$pos); | |
| 234 my ($masked_ratio_site,$masked_ratio_out,$site_depth,$out_masked,$region); | |
| 235 | |
| 236 # find positions of LTR borders (TG and CA) | |
| 237 @TG = (); # positions of "T" | |
| 238 @CA = (); # positions of "A" | |
| 239 for ($f=1;$f<$assembly_length;$f++) { | |
| 240 if ($assembly_seq[$f] eq "T" and $assembly_seq[$f+1] eq "G") { | |
| 241 push(@TG,$f); | |
| 242 } | |
| 243 if ($assembly_seq[$f] eq "C" and $assembly_seq[$f+1] eq "A") { | |
| 244 push(@CA,$f+1); | |
| 245 } | |
| 246 } | |
| 247 | |
| 248 foreach $pos (@TG) { | |
| 249 if ($pos-$window > 0 and $pos+$window-1 <= $assembly_length) { | |
| 250 $site_sum_char = 0; $site_sum_masked = 0; $site_seq = ""; | |
| 251 for ($f=$pos;$f<$pos+$window;$f++) { | |
| 252 $site_sum_char += $assembly_char[$f]; | |
| 253 $site_sum_masked += $assembly_masked[$f]; | |
| 254 $site_seq .= $assembly_seq[$f]; | |
| 255 } | |
| 256 $out_sum_char = 0; $out_sum_masked = 0; | |
| 257 for ($f=$pos-$window;$f<$pos;$f++) { | |
| 258 $out_sum_char += $assembly_char[$f]; | |
| 259 $out_sum_masked += $assembly_masked[$f]; | |
| 260 } | |
| 261 $site_depth = sprintf("%0.1f",$site_sum_char/$window); # average read (unmasked) depth over the site | |
| 262 $out_masked = sprintf("%0.1f",$out_sum_masked/$window); # average number of masked reads outside the site | |
| 263 $masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char)); | |
| 264 $masked_ratio_out = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char)); | |
| 265 if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) { | |
| 266 if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) { | |
| 267 print OUT "$cl\t$contig\tTG\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t"; | |
| 268 $region = ""; | |
| 269 for ($f=$pos;$f<=$assembly_length;$f++) { | |
| 270 if ($assembly_seq[$f] ne "*") { | |
| 271 $region .= $assembly_seq[$f]; | |
| 272 } | |
| 273 if (length($region) == $extract_region) { | |
| 274 $f = $assembly_length; # terminate cycle | |
| 275 } | |
| 276 } | |
| 277 print OUT "$region\t"; | |
| 278 print LTR ">CL",$cl,"c".$contig."_TG_$pos\n"; | |
| 279 $region = &revcompl($region); | |
| 280 print LTR "$region\n"; | |
| 281 $region = ""; | |
| 282 for ($f=$pos-1;$f>0;$f=$f-1) { | |
| 283 if ($assembly_seq[$f] ne "*") { | |
| 284 $region = $assembly_seq[$f].$region; | |
| 285 } | |
| 286 if (length($region) == $extract_region) { | |
| 287 $f = 0; # terminate cycle | |
| 288 } | |
| 289 } | |
| 290 print OUT "$region\n"; | |
| 291 print ADJ ">CL",$cl,"c".$contig."_TG_$pos\n"; | |
| 292 $region = &revcompl($region); | |
| 293 print ADJ "$region\n"; | |
| 294 } | |
| 295 } | |
| 296 } | |
| 297 } | |
| 298 | |
| 299 foreach $pos (@CA) { | |
| 300 if ($pos-$window+1 > 0 and $pos+$window <= $assembly_length) { | |
| 301 $site_sum_char = 0; $site_sum_masked = 0; $site_seq = ""; | |
| 302 for ($f=$pos-$window+1;$f<=$pos;$f++) { | |
| 303 $site_sum_char += $assembly_char[$f]; | |
| 304 $site_sum_masked += $assembly_masked[$f]; | |
| 305 $site_seq .= $assembly_seq[$f]; | |
| 306 } | |
| 307 $out_sum_char = 0; $out_sum_masked = 0; | |
| 308 for ($f=$pos+1;$f<=$pos+$window;$f++) { | |
| 309 $out_sum_char += $assembly_char[$f]; | |
| 310 $out_sum_masked += $assembly_masked[$f]; | |
| 311 } | |
| 312 $site_depth = sprintf("%0.1f",$site_sum_char/$window); # average read (unmasked) depth over the site | |
| 313 $out_masked = sprintf("%0.1f",$out_sum_masked/$window); # average number of masked reads outside the site | |
| 314 $masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char)); | |
| 315 $masked_ratio_out = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char)); | |
| 316 if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) { | |
| 317 if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) { | |
| 318 print OUT "$cl\t$contig\tCA\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t"; | |
| 319 $region = ""; | |
| 320 for ($f=$pos;$f>0;$f=$f-1) { | |
| 321 if ($assembly_seq[$f] ne "*") { | |
| 322 $region = $assembly_seq[$f].$region; | |
| 323 } | |
| 324 if (length($region) == $extract_region) { | |
| 325 $f = 0; # terminate cycle | |
| 326 } | |
| 327 } | |
| 328 print OUT "$region\t"; | |
| 329 print LTR ">CL",$cl,"c".$contig."_CA_$pos\n"; | |
| 330 print LTR "$region\n"; | |
| 331 $region = ""; | |
| 332 for ($f=$pos+1;$f<=$assembly_length;$f++) { | |
| 333 if ($assembly_seq[$f] ne "*") { | |
| 334 $region .= $assembly_seq[$f]; | |
| 335 } | |
| 336 if (length($region) == $extract_region) { | |
| 337 $f = $assembly_length; # terminate cycle | |
| 338 } | |
| 339 } | |
| 340 print OUT "$region\n"; | |
| 341 print ADJ ">CL",$cl,"c".$contig."_CA_$pos\n"; | |
| 342 print ADJ "$region\n"; | |
| 343 } | |
| 344 } | |
| 345 } | |
| 346 } | |
| 347 } | |
| 348 | |
| 349 sub add_PBS_info { | |
| 350 my ($pbs_blast_command,@pol,$rad,$prev_query,@table,$tab_length); | |
| 351 | |
| 352 $pbs_blast_command = "blastall -p blastn -d $db_PBS -i $out_ADJ -m 8 -b 1 -e 1 -W 7 -F F"; | |
| 353 | |
| 354 @table = (); | |
| 355 open (TAB,$out_table) or die; | |
| 356 while ($rad = <TAB>) { | |
| 357 push(@table,$rad); | |
| 358 $tab_length++; | |
| 359 } | |
| 360 close TAB; | |
| 361 | |
| 362 open (BLAST,"$pbs_blast_command |") or die; | |
| 363 $prev_query = ""; | |
| 364 while ($rad = <BLAST>) { | |
| 365 if ($rad =~/^CL(\d+)c(\d+)_(TG|CA)_(\d+)\t\S+\t\S+\t/) { | |
| 366 if ("$1\t$2\t$3\t$4" ne $prev_query) { # to exclude additional HSPs from the same query/subject pair | |
| 367 for ($f=0;$f<$tab_length;$f++) { | |
| 368 @pol = split(/\t/,$table[$f]); | |
| 369 if ($pol[0] eq "$1" and $pol[1] eq "$2" and $pol[2] eq "$3" and $pol[3] eq "$4") { | |
| 370 chomp($table[$f]); | |
| 371 $table[$f] .= "\t$rad"; | |
| 372 $f = $tab_length; # terminate cycle | |
| 373 } | |
| 374 } | |
| 375 $prev_query = "$1\t$2\t$3\t$4"; | |
| 376 } | |
| 377 } | |
| 378 } | |
| 379 close BLAST; | |
| 380 | |
| 381 open (TAB_WITH_BLAST,">$out_table.with_PBS_blast.csv") or die; | |
| 382 for ($f=0;$f<$tab_length;$f++) { | |
| 383 print TAB_WITH_BLAST $table[$f]; | |
| 384 } | |
| 385 close TAB_WITH_BLAST; | |
| 386 } | |
| 387 | |
| 388 | |
| 389 | |
| 390 |
