Mercurial > repos > yusuf > filter_bam_list
changeset 0:42af0b971c55 default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:33:46 -0600 |
parents | |
children | |
files | FilterBAMByNamesList.xml filter_bam_by_list |
diffstat | 2 files changed, 98 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/FilterBAMByNamesList.xml Wed Mar 25 13:33:46 2015 -0600 @@ -0,0 +1,43 @@ +<?xml version="1.0"?> + +<tool id="filter_bam_by_list_1" name="Filter a BAM file"> + <description>against a list of desired genomic regions, by name</description> + <version_string>echo 1.0.0</version_string> + <command interpreter="perl">filter_bam_by_list $bam_file $named_gene_regions_bed + ## Handle reference file. + #if $geneNameSource.source == "file": + $geneNameSource.file_of_names + #else: + "${geneNameSource.name_list}" + #end if + $filtered_bam_file $retained_regions_bed $samtools_messages</command> + <inputs> + <param format="bam" name="bam_file" type="data" label="Source BAM (mapped reads) file"/> + <param format="bed" name="named_gene_regions_bed" type="data" label="BED file with names for genomic regions (e.g. the UCSC hg19 refFlat gene names file available under 'Shared Data Libraries' -> 'Genomic coding sequence regions')"/> + <conditional name="geneNameSource"> + <param name="source" type="select" label="How would you like to specify the list of target region names?"> + <option value="file">A file</option> + <option value="list">A list</option> + </param> + <when value="file"> + <param format="text" name="file_of_names" type="data" label="Text file with target region names (usually gene names), one per line" help="Mapped reads in regions with any of these names are retained"/> + </when> + <when value="list"> + <param name="name_list" type="text" label="Space separated list of target region names (usually gene names)" help="Mapped reads in regions with any of these names are retained"/> + </when> + </conditional> + </inputs> + <outputs> + <data name="filtered_bam_file" format="bam" type="data" label="Reads mapped to target regions"/> + <data name="retained_regions_bed" format="bed" type="data" label="BED file of retained regions"/> + <data name="samtools_messages" format="text" type="data" label="Samtools messages"/> + </outputs> + + <tests/> + + <help> +This tool retains sequence reads of a BAM file that map to genomics regions matching any of the labels in the "names" file. This is useful for example to +report only a subset of an exome run that corresponds to a set of genes of interest. The names file should have one name per line. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_bam_by_list Wed Mar 25 13:33:46 2015 -0600 @@ -0,0 +1,55 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use IO::Handle; + +# Report lines of a file that have as one of the column values a value from the pattern file +@ARGV == 6 or die "Usage: $0 <input.bam> <named genomic regions.bed> <file of patterns> <filtered output.bam> <matched regions.bed> <output messages.txt>\n"; + +die "Input BAM file $ARGV[0] does not exist\n" if not -e $ARGV[0]; +die "Input BAM file $ARGV[0] is not readable\n" if not -r $ARGV[0]; + +my @alts; +if(-e $ARGV[2]){ + open(PATTERNS, $ARGV[2]) + or die "Cannot open $ARGV[1] for reading: $!\n"; + while(<PATTERNS>){ + chomp; + push @alts, $_; + } + close(PATTERNS); +} +else{ # else assume the arg is a list of names directly + @alts = split /\s+/, $ARGV[2]; +} + +my @regions; +my %seen; +my $regex = "^(?:".join("|", @alts).")\$"; +open(TAB, $ARGV[1]) + or die "Cannot open $ARGV[1] for reading: $!\n"; +while(<TAB>){ + chomp; + my @F = split /\t/, $_; + next unless @F > 3; + if($F[3] =~ /$regex/io){ + next if $seen{"$F[0]:$F[1]-$F[2]"}++; # sometimes regions are repeated, don't send these repeats to samtools + push @regions, $_; + } +} +close(TAB); + +die "No matches to desired names in the provided named genomic regions BED file, aborting filtered BAM file creation" if not @regions; + +open(BED, ">$ARGV[4]") + or die "Cannot open $ARGV[4] for writing: $!\n"; +print BED join("\n", @regions), "\n"; +close(BED); + +open(ERRFILE, ">$ARGV[5]") or die "Cannot open $ARGV[5] for writing: $!\n"; +STDOUT->fdopen(\*ERRFILE, "w") or die "Cannot redirect stdout to $ARGV[5]: $!\n"; +STDERR->fdopen(\*ERRFILE, "w") or die "Cannot redirect stderr to $ARGV[5]: $!\n"; +system("samtools view -h -L $ARGV[4] $ARGV[0] | samtools view -S -b - > $ARGV[3]") >> 8 + and die "Samtools failed: exit status ", ($?>>8), "\n"; +system("samtools index $ARGV[3]");