| 
0
 | 
     1 #!/usr/bin/env perl
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 use strict;
 | 
| 
 | 
     4 use warnings;
 | 
| 
 | 
     5 use Set::IntervalTree;
 | 
| 
 | 
     6 
 | 
| 
 | 
     7 # This script adds names to the first BED file regions, based on the names for regions provided 
 | 
| 
 | 
     8 # by the second BED file.  The output BED file is the same as the first BED file, but with names added where possible.
 | 
| 
 | 
     9 @ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n";
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 my %names;
 | 
| 
 | 
    12 open(NAMED_BED, $ARGV[1])
 | 
| 
 | 
    13   or die "Cannot open $ARGV[1] for reading: $!\n";
 | 
| 
 | 
    14 while(<NAMED_BED>){
 | 
| 
 | 
    15   my @F = split /\t/, $_;
 | 
| 
 | 
    16   next unless @F > 3;
 | 
| 
 | 
    17   $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]}; 
 | 
| 
 | 
    18   $names{$F[0]}->insert($F[3], $F[1], $F[2]);
 | 
| 
 | 
    19 }
 | 
| 
 | 
    20 close(NAMED_BED);
 | 
| 
 | 
    21 die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names; 
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 open(INPUT_BED, $ARGV[0])
 | 
| 
 | 
    24   or die "Cannot open $ARGV[0] for reading: $!\n";
 | 
| 
 | 
    25 open(OUTPUT_BED, ">$ARGV[2]")
 | 
| 
 | 
    26   or die "Cannot open $ARGV[2] for writing: $!\n";
 | 
| 
 | 
    27 my $num_datalines = 0;
 | 
| 
 | 
    28 my $num_changes = 0;
 | 
| 
 | 
    29 while(<INPUT_BED>){
 | 
| 
 | 
    30   chomp;
 | 
| 
 | 
    31   my @F = split /\t/, $_;
 | 
| 
 | 
    32   if(@F < 3){
 | 
| 
 | 
    33     # not a data line
 | 
| 
 | 
    34     print OUTPUT_BED $_, "\n";
 | 
| 
 | 
    35     next;
 | 
| 
 | 
    36   }
 | 
| 
 | 
    37   $num_datalines++;
 | 
| 
 | 
    38   my $chr = $F[0];
 | 
| 
 | 
    39   if(not exists $names{$chr}){ # must be a contig that was in the named BED file
 | 
| 
 | 
    40     if(exists $names{"chr$chr"}){
 | 
| 
 | 
    41       $chr = "chr$chr";
 | 
| 
 | 
    42     }
 | 
| 
 | 
    43     elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){
 | 
| 
 | 
    44       $chr = $1;
 | 
| 
 | 
    45     }
 | 
| 
 | 
    46     else{
 | 
| 
 | 
    47       next;
 | 
| 
 | 
    48     }
 | 
| 
 | 
    49   }
 | 
| 
 | 
    50   my $renames = $names{$chr}->fetch($F[1], $F[2]+1);
 | 
| 
 | 
    51   if(not @$renames){
 | 
| 
 | 
    52     print OUTPUT_BED $_,"\n"; # no mod
 | 
| 
 | 
    53   }
 | 
| 
 | 
    54   else{
 | 
| 
 | 
    55     $num_changes++;
 | 
| 
 | 
    56     my %h;
 | 
| 
 | 
    57     my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names
 | 
| 
 | 
    58     print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name
 | 
| 
 | 
    59   }
 | 
| 
 | 
    60 }
 | 
| 
 | 
    61 close(OUTPUT_BED);
 | 
| 
 | 
    62 close(INPUT_BED);
 | 
| 
 | 
    63 
 | 
| 
 | 
    64 if(open(LOG, ">>$ARGV[3]")){
 | 
| 
 | 
    65   print LOG "Changed $num_changes/$num_datalines lines\n";
 | 
| 
 | 
    66 }
 | 
| 
 | 
    67 else{
 | 
| 
 | 
    68   print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n";
 | 
| 
 | 
    69 }
 | 
| 
 | 
    70 
 |