| 
4
 | 
     1 #!/usr/bin/perl -w
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 use strict;
 | 
| 
 | 
     4 use warnings;
 | 
| 
 | 
     5 use Getopt::Std;
 | 
| 
 | 
     6 my $version = '0.5_galaxy';
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 my $SAMTOOLS_BIN_DIR="/home/cbl/Downloads/samtools-1.9/";
 | 
| 
 | 
     9 
 | 
| 
 | 
    10 my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." );
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY 
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory";
 | 
| 
 | 
    15 
 | 
| 
 | 
    16 my $pt_bad_mates_file=$opts{b};  #GALAXY 
 | 
| 
 | 
    17 my $pt_log_file=$opts{l}; #GALAXY 
 | 
| 
 | 
    18 my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY 
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 die(qq/
 | 
| 
 | 
    22     
 | 
| 
 | 
    23 Description:
 | 
| 
 | 
    24     
 | 
| 
 | 
    25     Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input
 | 
| 
 | 
    26     for SVDetect.
 | 
| 
 | 
    27 
 | 
| 
 | 
    28     From all pairs mapped onto the reference genome, this script outputs abnormal pairs:
 | 
| 
 | 
    29         - mapped on two different chromosomes
 | 
| 
 | 
    30         - with an incorrect strand orientation and\/or pair order
 | 
| 
 | 
    31         - with an insert size distance +- sigma threshold
 | 
| 
 | 
    32     into a file <prefix.ab.bam\/sam>
 | 
| 
 | 
    33     
 | 
| 
 | 
    34     -BAM\/SAM File input format only, sorted by read names
 | 
| 
 | 
    35 
 | 
| 
 | 
    36     Version : $version
 | 
| 
 | 
    37     SAMtools required for BAM files
 | 
| 
 | 
    38     
 | 
| 
 | 
    39     
 | 
| 
 | 
    40 Usage:   BAM_preprocessingPairs.pl [options] <all_mate_file.sorted.bam\/sam>
 | 
| 
 | 
    41 
 | 
| 
 | 
    42 Options: -t BOOLEAN   read type: =1 (Illumina), =0 (SOLiD) [$opts{t}]
 | 
| 
 | 
    43          -p BOOLEAN   pair type: =1 (paired-end), =0 (mate-pair)  [$opts{p}]
 | 
| 
 | 
    44          -n INTEGER   number of pairs for calculating mu and sigma lengths [$opts{n}]
 | 
| 
 | 
    45 	 -s INTEGER   minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}]
 | 
| 
 | 
    46 	 -S INTEGER   maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}]
 | 
| 
 | 
    47          -f REAL      minimal number of sigma fold for filtering pairs [$opts{f}]
 | 
| 
 | 
    48          -d           dump normal pairs into a file [<prefix.norm.bam\/sam>] (optional)
 | 
| 
 | 
    49 	 -o STRING    output directory [$working_dir]
 | 
| 
 | 
    50 
 | 
| 
 | 
    51 \n/) if (@ARGV == 0 && -t STDIN);
 | 
| 
 | 
    52 
 | 
| 
 | 
    53 unless (-d $opts{o}){
 | 
| 
 | 
    54 	mkdir $opts{o} or die;
 | 
| 
 | 
    55 }
 | 
| 
 | 
    56 $opts{o}.="/" if($opts{o}!~/\/$/);
 | 
| 
 | 
    57 
 | 
| 
 | 
    58 my $mates_file=shift(@ARGV);
 | 
| 
 | 
    59 
 | 
| 
 | 
    60 $mates_file=readlink($mates_file);
 | 
| 
 | 
    61 
 | 
| 
 | 
    62 my $bad_mates_file=(split(/\//,$mates_file))[$#_];
 | 
| 
 | 
    63 
 | 
| 
 | 
    64 if($bad_mates_file=~/.(s|b)am$/){
 | 
| 
 | 
    65     $bad_mates_file=~s/.(b|s)am$/.ab.sam/;
 | 
| 
 | 
    66     $bad_mates_file=$opts{o}.$bad_mates_file;
 | 
| 
 | 
    67 }
 | 
| 
 | 
    68 
 | 
| 
 | 
    69 else{
 | 
| 
 | 
    70     die "Error: mate_file with the extension <.bam> or <.sam> needed !\n";
 | 
| 
 | 
    71 }
 | 
| 
 | 
    72 
 | 
| 
 | 
    73 my $good_mates_file;
 | 
| 
 | 
    74 if($opts{d}){
 | 
| 
 | 
    75     $good_mates_file=(split(/\//,$mates_file))[$#_];
 | 
| 
 | 
    76     $good_mates_file=~s/.(b|s)am$/.norm.sam/;
 | 
| 
 | 
    77     $good_mates_file=$opts{o}.$good_mates_file;
 | 
| 
 | 
    78 }
 | 
| 
 | 
    79 
 | 
| 
 | 
    80 my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY 
 | 
| 
 | 
    81 
 | 
| 
 | 
    82 #------------------------------------------------------------------------------#
 | 
| 
 | 
    83 #Calculate mu and sigma
 | 
| 
 | 
    84 
 | 
| 
 | 
    85 open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 print LOG "\# Calculating mu and sigma lengths...\n";
 | 
| 
 | 
    88 print LOG "-- file=$mates_file\n";
 | 
| 
 | 
    89 print LOG "-- n=$opts{n}\n";
 | 
| 
 | 
    90 print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n";
 | 
| 
 | 
    91 
 | 
| 
 | 
    92 my ($record, $sumX,$sumX2) = (0,0,0);
 | 
| 
 | 
    93 my $warn=$opts{n}/10;
 | 
| 
 | 
    94 my $prev_pair="FIRST";
 | 
| 
 | 
    95 
 | 
| 
 | 
    96 my $bam=($mates_file =~ /.bam$/)? 1:0;
 | 
| 
 | 
    97 
 | 
| 
 | 
    98 if($bam){
 | 
| 
 | 
    99     open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
 | 
| 
 | 
   100 }else{
 | 
| 
 | 
   101     open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n";
 | 
| 
 | 
   102 }
 | 
| 
 | 
   103 
 | 
| 
 | 
   104 while(<MATES>){
 | 
| 
 | 
   105     
 | 
| 
 | 
   106     my @t=split;
 | 
| 
 | 
   107     
 | 
| 
 | 
   108     next if ($t[0]=~/^@/);
 | 
| 
 | 
   109     
 | 
| 
 | 
   110     my $current_pair=$t[0];
 | 
| 
 | 
   111     next if($current_pair eq $prev_pair);
 | 
| 
 | 
   112     $prev_pair=$current_pair;                                                   
 | 
| 
 | 
   113     
 | 
| 
 | 
   114     my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8]));
 | 
| 
 | 
   115     
 | 
| 
 | 
   116     next if (($t[1]&0x0004) || ($t[1]&0x0008));
 | 
| 
 | 
   117     next if ($length<$opts{s} || $length>$opts{S}) ;
 | 
| 
 | 
   118     
 | 
| 
 | 
   119     if($chr2 eq "="){
 | 
| 
 | 
   120 
 | 
| 
 | 
   121         $sumX += $length;							#add to sum and sum^2 for mean and variance calculation
 | 
| 
 | 
   122 	$sumX2 += $length*$length;
 | 
| 
 | 
   123         $record++;
 | 
| 
 | 
   124     }
 | 
| 
 | 
   125 
 | 
| 
 | 
   126     if($record>$warn){
 | 
| 
 | 
   127 	print LOG "-- $warn pairs analysed\n";
 | 
| 
 | 
   128         $warn+=$warn;
 | 
| 
 | 
   129     }
 | 
| 
 | 
   130     
 | 
| 
 | 
   131     last if ($record>$opts{n});
 | 
| 
 | 
   132     
 | 
| 
 | 
   133 }
 | 
| 
 | 
   134 close (MATES);
 | 
| 
 | 
   135 
 | 
| 
 | 
   136 $record--;
 | 
| 
 | 
   137 my $mu = $sumX/$record;
 | 
| 
 | 
   138 my $sigma = sqrt($sumX2/$record - $mu*$mu); 
 | 
| 
 | 
   139 
 | 
| 
 | 
   140 print LOG "-- Total : $record pairs analysed\n";
 | 
| 
 | 
   141 print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n";
 | 
| 
 | 
   142 
 | 
| 
 | 
   143 #------------------------------------------------------------------------------#
 | 
| 
 | 
   144 #------------------------------------------------------------------------------#
 | 
| 
 | 
   145 #Preprocessing pairs
 | 
| 
 | 
   146 
 | 
| 
 | 
   147 $warn=100000;
 | 
| 
 | 
   148 
 | 
| 
 | 
   149 $record=0;
 | 
| 
 | 
   150 my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0);
 | 
| 
 | 
   151 
 | 
| 
 | 
   152 my $read_type=($opts{t})? "Illumina":"SOLiD";
 | 
| 
 | 
   153 my $pair_type=($opts{p})? "paired-end":"mate-paired";
 | 
| 
 | 
   154 
 | 
| 
 | 
   155 print LOG "\# Preprocessing pairs...\n";
 | 
| 
 | 
   156 print LOG "-- file= $mates_file\n";
 | 
| 
 | 
   157 print LOG "-- type= $read_type $pair_type reads\n";
 | 
| 
 | 
   158 print LOG "-- sigma threshold= $opts{f}\n";
 | 
| 
 | 
   159 print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n";
 | 
| 
 | 
   160 
 | 
| 
 | 
   161 my @header;
 | 
| 
 | 
   162 
 | 
| 
 | 
   163 if($bam){
 | 
| 
 | 
   164     open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
 | 
| 
 | 
   165     @header=<HEADER>;
 | 
| 
 | 
   166     close HEADER;
 | 
| 
 | 
   167     open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
 | 
| 
 | 
   168 }else{
 | 
| 
 | 
   169     open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n";
 | 
| 
 | 
   170 }
 | 
| 
 | 
   171 
 | 
| 
 | 
   172 open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n";
 | 
| 
 | 
   173 print AB @header if($bam);
 | 
| 
 | 
   174 
 | 
| 
 | 
   175 if($opts{d}){
 | 
| 
 | 
   176     open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n";
 | 
| 
 | 
   177     print NORM @header if($bam);
 | 
| 
 | 
   178 }
 | 
| 
 | 
   179 
 | 
| 
 | 
   180 $prev_pair="FIRST";
 | 
| 
 | 
   181 my $prev_bad;
 | 
| 
 | 
   182 
 | 
| 
 | 
   183 while(<MATES>){
 | 
| 
 | 
   184     
 | 
| 
 | 
   185     my @t=split;
 | 
| 
 | 
   186     my $bad=0;
 | 
| 
 | 
   187     
 | 
| 
 | 
   188     if ($t[0]=~/^@/){
 | 
| 
 | 
   189         print AB;
 | 
| 
 | 
   190         print NORM if ($opts{d});
 | 
| 
 | 
   191         next;
 | 
| 
 | 
   192     }
 | 
| 
 | 
   193     
 | 
| 
 | 
   194     my $current_pair=$t[0];
 | 
| 
 | 
   195     if($current_pair eq $prev_pair){
 | 
| 
 | 
   196         next if($prev_bad==-1);
 | 
| 
 | 
   197         if($prev_bad){
 | 
| 
 | 
   198             print AB;
 | 
| 
 | 
   199         }elsif(!$prev_bad){
 | 
| 
 | 
   200             print NORM if($opts{d});
 | 
| 
 | 
   201         }
 | 
| 
 | 
   202         next;
 | 
| 
 | 
   203     }
 | 
| 
 | 
   204     
 | 
| 
 | 
   205     $prev_pair=$current_pair;
 | 
| 
 | 
   206     
 | 
| 
 | 
   207     my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8]));
 | 
| 
 | 
   208     
 | 
| 
 | 
   209     if (($t[1]&0x0004) || ($t[1]&0x0008)){
 | 
| 
 | 
   210         $prev_bad=-1;
 | 
| 
 | 
   211         $count{unmap}++;
 | 
| 
 | 
   212         $record++;
 | 
| 
 | 
   213         next;
 | 
| 
 | 
   214         
 | 
| 
 | 
   215     }
 | 
| 
 | 
   216     
 | 
| 
 | 
   217     my $strand1 = (($t[1]&0x0010))? 'R':'F';
 | 
| 
 | 
   218     my $strand2 = (($t[1]&0x0020))? 'R':'F';
 | 
| 
 | 
   219     my $order1  = (($t[1]&0x0040))? '1':'2';
 | 
| 
 | 
   220     my $order2  = (($t[1]&0x0080))? '1':'2';
 | 
| 
 | 
   221     
 | 
| 
 | 
   222     if($order1 == 2){
 | 
| 
 | 
   223         ($strand1,$strand2)=($strand2,$strand1);
 | 
| 
 | 
   224         ($chr1,$chr2)=($chr2,$chr1);
 | 
| 
 | 
   225         ($pos1,$pos2)=($pos2,$pos1);
 | 
| 
 | 
   226         ($order1,$order2)=($order2,$order1);
 | 
| 
 | 
   227     }
 | 
| 
 | 
   228     
 | 
| 
 | 
   229     my $sense=$strand1.$strand2;
 | 
| 
 | 
   230     
 | 
| 
 | 
   231     if($chr1 ne "=" && $chr2 ne "="){
 | 
| 
 | 
   232         $bad=1;
 | 
| 
 | 
   233         $count{chr}++;
 | 
| 
 | 
   234     }
 | 
| 
 | 
   235     
 | 
| 
 | 
   236     if($opts{p}){ #paired-end
 | 
| 
 | 
   237         if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){
 | 
| 
 | 
   238             $bad=1;
 | 
| 
 | 
   239             $count{sense}++;
 | 
| 
 | 
   240         }
 | 
| 
 | 
   241     }else{ #mate-pair
 | 
| 
 | 
   242         if($opts{t}){ #Illumina
 | 
| 
 | 
   243             if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){
 | 
| 
 | 
   244             $bad=1;
 | 
| 
 | 
   245             $count{sense}++;
 | 
| 
 | 
   246             }
 | 
| 
 | 
   247         }else{ #SOLiD
 | 
| 
 | 
   248             if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){
 | 
| 
 | 
   249             $bad=1;
 | 
| 
 | 
   250             $count{sense}++;
 | 
| 
 | 
   251             }
 | 
| 
 | 
   252         }
 | 
| 
 | 
   253     }
 | 
| 
 | 
   254     
 | 
| 
 | 
   255     if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){
 | 
| 
 | 
   256         $bad=1;
 | 
| 
 | 
   257         $count{dist}++;
 | 
| 
 | 
   258     }
 | 
| 
 | 
   259     
 | 
| 
 | 
   260     if($bad){
 | 
| 
 | 
   261         print AB;
 | 
| 
 | 
   262         $count{ab}++;
 | 
| 
 | 
   263         $prev_bad=$bad;
 | 
| 
 | 
   264     }else{
 | 
| 
 | 
   265         print NORM if ($opts{d});
 | 
| 
 | 
   266         $count{norm}++;
 | 
| 
 | 
   267         $prev_bad=$bad;
 | 
| 
 | 
   268     }
 | 
| 
 | 
   269     
 | 
| 
 | 
   270     $record++;
 | 
| 
 | 
   271     
 | 
| 
 | 
   272     if($record>$warn){
 | 
| 
 | 
   273         print LOG "-- $warn pairs analysed\n";
 | 
| 
 | 
   274         $warn+=100000;
 | 
| 
 | 
   275     }
 | 
| 
 | 
   276 }
 | 
| 
 | 
   277 
 | 
| 
 | 
   278 close AB;
 | 
| 
 | 
   279 close NORM if($opts{d});
 | 
| 
 | 
   280 
 | 
| 
 | 
   281 print LOG "-- Total : $record pairs analysed\n";
 | 
| 
 | 
   282 print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n";
 | 
| 
 | 
   283 print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n";
 | 
| 
 | 
   284 print LOG "---- $count{ab} abnormal mapped pairs\n";
 | 
| 
 | 
   285 print LOG "------ $count{chr} pairs mapped on two different chromosomes\n";
 | 
| 
 | 
   286 print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n";
 | 
| 
 | 
   287 print LOG "------ $count{dist} pairs with incorrect insert size distance\n";
 | 
| 
 | 
   288 print LOG "--- $count{norm} correct mapped pairs\n";
 | 
| 
 | 
   289 
 | 
| 
 | 
   290 #------------------------------------------------------------------------------#
 | 
| 
 | 
   291 #------------------------------------------------------------------------------#
 | 
| 
 | 
   292 #OUTPUT
 | 
| 
 | 
   293 
 | 
| 
 | 
   294 if($bam){
 | 
| 
 | 
   295     
 | 
| 
 | 
   296     my $bam_file=$bad_mates_file;
 | 
| 
 | 
   297     $bam_file=~s/.sam$/.bam/;
 | 
| 
 | 
   298     print LOG "\# Converting sam to bam for abnormal mapped pairs\n";
 | 
| 
 | 
   299     system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log");
 | 
| 
 | 
   300     unlink($bad_mates_file);
 | 
| 
 | 
   301     print LOG "-- output created: $bam_file\n";
 | 
| 
 | 
   302 
 | 
| 
 | 
   303     system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY
 | 
| 
 | 
   304     
 | 
| 
 | 
   305     if($opts{d}){
 | 
| 
 | 
   306         $bam_file=$good_mates_file;
 | 
| 
 | 
   307         $bam_file=~s/.sam$/.bam/;
 | 
| 
 | 
   308         print LOG "\# Converting sam to bam for correct mapped pairs\n";
 | 
| 
 | 
   309         system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log");
 | 
| 
 | 
   310         unlink($good_mates_file);
 | 
| 
 | 
   311         print LOG "-- output created: $bam_file\n";
 | 
| 
 | 
   312 
 | 
| 
 | 
   313 	system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY
 | 
| 
 | 
   314 
 | 
| 
 | 
   315     }
 | 
| 
 | 
   316 
 | 
| 
 | 
   317 }
 | 
| 
 | 
   318 
 | 
| 
 | 
   319 else{
 | 
| 
 | 
   320     print LOG "-- output created: $bad_mates_file\n";
 | 
| 
 | 
   321     print LOG "-- output created: $good_mates_file\n" if($opts{d});
 | 
| 
 | 
   322 }
 | 
| 
 | 
   323 
 | 
| 
 | 
   324 close LOG;
 | 
| 
 | 
   325 
 | 
| 
 | 
   326 system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY
 | 
| 
 | 
   327 
 | 
| 
 | 
   328 
 | 
| 
 | 
   329 #------------------------------------------------------------------------------#
 | 
| 
 | 
   330 #------------------------------------------------------------------------------#
 | 
| 
 | 
   331 sub decimal{
 | 
| 
 | 
   332     
 | 
| 
 | 
   333   my $num=shift;
 | 
| 
 | 
   334   my $digs_to_cut=shift;
 | 
| 
 | 
   335 
 | 
| 
 | 
   336   $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/);
 | 
| 
 | 
   337 
 | 
| 
 | 
   338   return $num;
 | 
| 
 | 
   339 }
 | 
| 
 | 
   340 #------------------------------------------------------------------------------#
 |