changeset 0:42af0b971c55 default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:33:46 -0600
parents
children
files FilterBAMByNamesList.xml filter_bam_by_list
diffstat 2 files changed, 98 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/FilterBAMByNamesList.xml	Wed Mar 25 13:33:46 2015 -0600
@@ -0,0 +1,43 @@
+<?xml version="1.0"?>
+
+<tool id="filter_bam_by_list_1" name="Filter a BAM file">
+  <description>against a list of desired genomic regions, by name</description>
+  <version_string>echo 1.0.0</version_string>
+  <command interpreter="perl">filter_bam_by_list $bam_file $named_gene_regions_bed 
+            ## Handle reference file.
+            #if $geneNameSource.source == "file":
+                $geneNameSource.file_of_names 
+            #else:
+                "${geneNameSource.name_list}"
+            #end if
+     $filtered_bam_file $retained_regions_bed $samtools_messages</command>
+  <inputs>
+    <param format="bam" name="bam_file" type="data" label="Source BAM (mapped reads) file"/>
+    <param format="bed" name="named_gene_regions_bed" type="data" label="BED file with names for genomic regions (e.g. the UCSC hg19 refFlat gene names file available under 'Shared Data Libraries' -&gt; 'Genomic coding sequence regions')"/>
+    <conditional name="geneNameSource">
+       <param name="source" type="select" label="How would you like to specify the list of target region names?">
+            <option value="file">A file</option>
+            <option value="list">A list</option>
+       </param>
+       <when value="file">
+         <param format="text" name="file_of_names" type="data" label="Text file with target region names (usually gene names), one per line" help="Mapped reads in regions with any of these names are retained"/>
+       </when>
+       <when value="list">
+         <param name="name_list" type="text" label="Space separated list of target region names (usually gene names)" help="Mapped reads in regions with any of these names are retained"/>
+       </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data name="filtered_bam_file" format="bam" type="data" label="Reads mapped to target regions"/>
+    <data name="retained_regions_bed" format="bed" type="data" label="BED file of retained regions"/>
+    <data name="samtools_messages" format="text" type="data" label="Samtools messages"/>
+  </outputs>
+
+  <tests/>
+
+  <help>
+This tool retains sequence reads of a BAM file that map to genomics regions matching any of the labels in the "names" file. This is useful for example to 
+report only a subset of an exome run that corresponds to a set of genes of interest. The names file should have one name per line. 
+ </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_bam_by_list	Wed Mar 25 13:33:46 2015 -0600
@@ -0,0 +1,55 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use IO::Handle;
+
+# Report lines of a file that have as one of the column values a value from the pattern file
+@ARGV == 6 or die "Usage: $0 <input.bam> <named genomic regions.bed> <file of patterns> <filtered output.bam> <matched regions.bed> <output messages.txt>\n";
+
+die "Input BAM file $ARGV[0] does not exist\n" if not -e $ARGV[0];
+die "Input BAM file $ARGV[0] is not readable\n" if not -r $ARGV[0];
+
+my @alts;
+if(-e $ARGV[2]){
+  open(PATTERNS, $ARGV[2])
+    or die "Cannot open $ARGV[1] for reading: $!\n";
+  while(<PATTERNS>){
+    chomp;
+    push @alts, $_;
+  }
+  close(PATTERNS);
+}
+else{ # else assume the arg is a list of names directly
+  @alts = split /\s+/, $ARGV[2];
+}
+
+my @regions;
+my %seen;
+my $regex = "^(?:".join("|", @alts).")\$";
+open(TAB, $ARGV[1])
+  or die "Cannot open $ARGV[1] for reading: $!\n";
+while(<TAB>){
+  chomp;
+  my @F = split /\t/, $_;
+  next unless @F > 3;
+  if($F[3] =~ /$regex/io){
+    next if $seen{"$F[0]:$F[1]-$F[2]"}++; # sometimes regions are repeated, don't send these repeats to samtools
+    push @regions, $_;
+  }
+}
+close(TAB);
+
+die "No matches to desired names in the provided named genomic regions BED file, aborting filtered BAM file creation" if not @regions;
+
+open(BED, ">$ARGV[4]")
+  or die "Cannot open $ARGV[4] for writing: $!\n";
+print BED join("\n", @regions), "\n";
+close(BED);
+
+open(ERRFILE, ">$ARGV[5]") or die "Cannot open $ARGV[5] for writing: $!\n";
+STDOUT->fdopen(\*ERRFILE, "w") or die "Cannot redirect stdout to $ARGV[5]: $!\n";
+STDERR->fdopen(\*ERRFILE, "w") or die "Cannot redirect stderr to $ARGV[5]: $!\n";
+system("samtools view -h -L $ARGV[4] $ARGV[0] | samtools view -S -b - > $ARGV[3]") >> 8 
+  and die "Samtools failed: exit status ", ($?>>8), "\n"; 
+system("samtools index $ARGV[3]");