Mercurial > repos > yusuf > filter_bam_list
view filter_bam_by_list @ 0:42af0b971c55 default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:33:46 -0600 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; use IO::Handle; # Report lines of a file that have as one of the column values a value from the pattern file @ARGV == 6 or die "Usage: $0 <input.bam> <named genomic regions.bed> <file of patterns> <filtered output.bam> <matched regions.bed> <output messages.txt>\n"; die "Input BAM file $ARGV[0] does not exist\n" if not -e $ARGV[0]; die "Input BAM file $ARGV[0] is not readable\n" if not -r $ARGV[0]; my @alts; if(-e $ARGV[2]){ open(PATTERNS, $ARGV[2]) or die "Cannot open $ARGV[1] for reading: $!\n"; while(<PATTERNS>){ chomp; push @alts, $_; } close(PATTERNS); } else{ # else assume the arg is a list of names directly @alts = split /\s+/, $ARGV[2]; } my @regions; my %seen; my $regex = "^(?:".join("|", @alts).")\$"; open(TAB, $ARGV[1]) or die "Cannot open $ARGV[1] for reading: $!\n"; while(<TAB>){ chomp; my @F = split /\t/, $_; next unless @F > 3; if($F[3] =~ /$regex/io){ next if $seen{"$F[0]:$F[1]-$F[2]"}++; # sometimes regions are repeated, don't send these repeats to samtools push @regions, $_; } } close(TAB); die "No matches to desired names in the provided named genomic regions BED file, aborting filtered BAM file creation" if not @regions; open(BED, ">$ARGV[4]") or die "Cannot open $ARGV[4] for writing: $!\n"; print BED join("\n", @regions), "\n"; close(BED); open(ERRFILE, ">$ARGV[5]") or die "Cannot open $ARGV[5] for writing: $!\n"; STDOUT->fdopen(\*ERRFILE, "w") or die "Cannot redirect stdout to $ARGV[5]: $!\n"; STDERR->fdopen(\*ERRFILE, "w") or die "Cannot redirect stderr to $ARGV[5]: $!\n"; system("samtools view -h -L $ARGV[4] $ARGV[0] | samtools view -S -b - > $ARGV[3]") >> 8 and die "Samtools failed: exit status ", ($?>>8), "\n"; system("samtools index $ARGV[3]");