Mercurial > repos > yusuf > add_names_bed
comparison add_names_to_bed @ 0:5e699c743e37 default tip
initial commit
| author | Yusuf Ali <ali@yusuf.email> | 
|---|---|
| date | Wed, 25 Mar 2015 13:09:20 -0600 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:5e699c743e37 | 
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 use Set::IntervalTree; | |
| 6 | |
| 7 # This script adds names to the first BED file regions, based on the names for regions provided | |
| 8 # by the second BED file. The output BED file is the same as the first BED file, but with names added where possible. | |
| 9 @ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n"; | |
| 10 | |
| 11 my %names; | |
| 12 open(NAMED_BED, $ARGV[1]) | |
| 13 or die "Cannot open $ARGV[1] for reading: $!\n"; | |
| 14 while(<NAMED_BED>){ | |
| 15 my @F = split /\t/, $_; | |
| 16 next unless @F > 3; | |
| 17 $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]}; | |
| 18 $names{$F[0]}->insert($F[3], $F[1], $F[2]); | |
| 19 } | |
| 20 close(NAMED_BED); | |
| 21 die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names; | |
| 22 | |
| 23 open(INPUT_BED, $ARGV[0]) | |
| 24 or die "Cannot open $ARGV[0] for reading: $!\n"; | |
| 25 open(OUTPUT_BED, ">$ARGV[2]") | |
| 26 or die "Cannot open $ARGV[2] for writing: $!\n"; | |
| 27 my $num_datalines = 0; | |
| 28 my $num_changes = 0; | |
| 29 while(<INPUT_BED>){ | |
| 30 chomp; | |
| 31 my @F = split /\t/, $_; | |
| 32 if(@F < 3){ | |
| 33 # not a data line | |
| 34 print OUTPUT_BED $_, "\n"; | |
| 35 next; | |
| 36 } | |
| 37 $num_datalines++; | |
| 38 my $chr = $F[0]; | |
| 39 if(not exists $names{$chr}){ # must be a contig that was in the named BED file | |
| 40 if(exists $names{"chr$chr"}){ | |
| 41 $chr = "chr$chr"; | |
| 42 } | |
| 43 elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){ | |
| 44 $chr = $1; | |
| 45 } | |
| 46 else{ | |
| 47 next; | |
| 48 } | |
| 49 } | |
| 50 my $renames = $names{$chr}->fetch($F[1], $F[2]+1); | |
| 51 if(not @$renames){ | |
| 52 print OUTPUT_BED $_,"\n"; # no mod | |
| 53 } | |
| 54 else{ | |
| 55 $num_changes++; | |
| 56 my %h; | |
| 57 my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names | |
| 58 print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name | |
| 59 } | |
| 60 } | |
| 61 close(OUTPUT_BED); | |
| 62 close(INPUT_BED); | |
| 63 | |
| 64 if(open(LOG, ">>$ARGV[3]")){ | |
| 65 print LOG "Changed $num_changes/$num_datalines lines\n"; | |
| 66 } | |
| 67 else{ | |
| 68 print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n"; | |
| 69 } | |
| 70 | 
