Mercurial > repos > yusuf > add_names_bed
diff add_names_to_bed @ 0:5e699c743e37 default tip
initial commit
| author | Yusuf Ali <ali@yusuf.email> | 
|---|---|
| date | Wed, 25 Mar 2015 13:09:20 -0600 | 
| parents | |
| children | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/add_names_to_bed Wed Mar 25 13:09:20 2015 -0600 @@ -0,0 +1,70 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Set::IntervalTree; + +# This script adds names to the first BED file regions, based on the names for regions provided +# by the second BED file. The output BED file is the same as the first BED file, but with names added where possible. +@ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n"; + +my %names; +open(NAMED_BED, $ARGV[1]) + or die "Cannot open $ARGV[1] for reading: $!\n"; +while(<NAMED_BED>){ + my @F = split /\t/, $_; + next unless @F > 3; + $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]}; + $names{$F[0]}->insert($F[3], $F[1], $F[2]); +} +close(NAMED_BED); +die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names; + +open(INPUT_BED, $ARGV[0]) + or die "Cannot open $ARGV[0] for reading: $!\n"; +open(OUTPUT_BED, ">$ARGV[2]") + or die "Cannot open $ARGV[2] for writing: $!\n"; +my $num_datalines = 0; +my $num_changes = 0; +while(<INPUT_BED>){ + chomp; + my @F = split /\t/, $_; + if(@F < 3){ + # not a data line + print OUTPUT_BED $_, "\n"; + next; + } + $num_datalines++; + my $chr = $F[0]; + if(not exists $names{$chr}){ # must be a contig that was in the named BED file + if(exists $names{"chr$chr"}){ + $chr = "chr$chr"; + } + elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){ + $chr = $1; + } + else{ + next; + } + } + my $renames = $names{$chr}->fetch($F[1], $F[2]+1); + if(not @$renames){ + print OUTPUT_BED $_,"\n"; # no mod + } + else{ + $num_changes++; + my %h; + my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names + print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name + } +} +close(OUTPUT_BED); +close(INPUT_BED); + +if(open(LOG, ">>$ARGV[3]")){ + print LOG "Changed $num_changes/$num_datalines lines\n"; +} +else{ + print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n"; +} +
