view add_names_to_bed @ 0:5e699c743e37 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:09:20 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use strict;
use warnings;
use Set::IntervalTree;

# This script adds names to the first BED file regions, based on the names for regions provided 
# by the second BED file.  The output BED file is the same as the first BED file, but with names added where possible.
@ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n";

my %names;
open(NAMED_BED, $ARGV[1])
  or die "Cannot open $ARGV[1] for reading: $!\n";
while(<NAMED_BED>){
  my @F = split /\t/, $_;
  next unless @F > 3;
  $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]}; 
  $names{$F[0]}->insert($F[3], $F[1], $F[2]);
}
close(NAMED_BED);
die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names; 

open(INPUT_BED, $ARGV[0])
  or die "Cannot open $ARGV[0] for reading: $!\n";
open(OUTPUT_BED, ">$ARGV[2]")
  or die "Cannot open $ARGV[2] for writing: $!\n";
my $num_datalines = 0;
my $num_changes = 0;
while(<INPUT_BED>){
  chomp;
  my @F = split /\t/, $_;
  if(@F < 3){
    # not a data line
    print OUTPUT_BED $_, "\n";
    next;
  }
  $num_datalines++;
  my $chr = $F[0];
  if(not exists $names{$chr}){ # must be a contig that was in the named BED file
    if(exists $names{"chr$chr"}){
      $chr = "chr$chr";
    }
    elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){
      $chr = $1;
    }
    else{
      next;
    }
  }
  my $renames = $names{$chr}->fetch($F[1], $F[2]+1);
  if(not @$renames){
    print OUTPUT_BED $_,"\n"; # no mod
  }
  else{
    $num_changes++;
    my %h;
    my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names
    print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name
  }
}
close(OUTPUT_BED);
close(INPUT_BED);

if(open(LOG, ">>$ARGV[3]")){
  print LOG "Changed $num_changes/$num_datalines lines\n";
}
else{
  print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n";
}