view extend_bed_regions @ 0:cb1c17dddc14 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:32:42 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use strict;
use warnings;

# This script adds up to X bases to either end of BED file regions (CAN BE NEGATIVE). The tool will merge regions that overlap after the expansion.
@ARGV == 4 or die "Usage: $0 <input regions.bed> <# bases> <5|3|both> <output.bed>\nWhere 5, 3 or both controls which flank is modified\n";

my $extension = $ARGV[1];
my $ends = $ARGV[2];

my %chrs;
my @chr_names;
open(BED, $ARGV[0])
  or die "Cannot open $ARGV[0] for reading: $!\n";
# assuming input in sorted by start per contig
while(<BED>){
  chomp;
  my @F = split /\t/, $_;
  my $strand = @F > 5 ? $F[5] : "+"; # is the strand field present?
  my $start;
  my $stop;
  if($strand eq "+"){
    if($ends eq "both" or $ends eq "5"){
      $start = $F[1]-$extension;
      $start = 1 if $start < 1;
    }
    if($ends eq "both" or $ends eq "3"){
      $stop = $F[2]+$extension; # can't check if okay unless we knew the chr sizes...
    }
  }
  else{
    # negative strand
    if($ends eq "both" or $ends eq "3"){
      $start = $F[1]-$extension;
      $start = 1 if $start < 1;
    }
    if($ends eq "both" or $ends eq "5"){
      $stop = $F[2]+$extension; # can't check if okay unless we knew the chr sizes...
    }
  }
  if(not exists $chrs{$F[0]}){
    $chrs{$F[0]} = [[$start,$stop]];
    push @chr_names, $F[0];
    next;
  }
  my $last_interval = $chrs{$F[0]}->[$#{$chrs{$F[0]}}];
  if($start <= $last_interval->[1]){ #overlaps previous
    if($stop > $last_interval->[1]){ # extends beyond previous
      $last_interval->[1] = $stop;
    } #else it doesn't contribute new bases
  }
  else{ # standalone
    push @{$chrs{$F[0]}}, [$start, $stop];
  }
}
close(BED);

open(OUTPUT_BED, ">$ARGV[3]")
  or die "Cannot open $ARGV[3] for writing: $!\n";
for my $chr_name (@chr_names){
  for my $interval (@{$chrs{$chr_name}}){
    print OUTPUT_BED join("\t",$chr_name,$interval->[0],$interval->[1]), "\n";
  }
}
close(OUTPUT_BED);