changeset 2:825c3b7313b6 draft

Uploaded
author romaingred
date Thu, 16 Nov 2017 10:14:55 -0500
parents f55d249522a0
children 98d99e51579a
files gred_rnaseq-2369cfe4f737/alignTophat.pl gred_rnaseq-2369cfe4f737/alignTophat.xml gred_rnaseq-2369cfe4f737/bed_pipe.loc.sample gred_rnaseq-2369cfe4f737/tool_data_table_conf.xml.sample gred_rnaseq-2369cfe4f737/tool_dependencies.xml
diffstat 5 files changed, 504 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gred_rnaseq-2369cfe4f737/alignTophat.pl	Thu Nov 16 10:14:55 2017 -0500
@@ -0,0 +1,334 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $max_procs = 8;
+
+my ($ma, $fastq, $fastqN, $mis,$findSens, $contaminants, $directional, $build_contaminant, $ref, $build_ref, $annotation, $dirBam, $dirBam_cont,$dirBed, $dirBedgraph, $outText, $outSimple, $random_w);
+
+GetOptions (
+"direction:s" => \$directional,
+"random_w:s" => \$random_w,
+"find:s" => \$findSens,
+"fastq=s" => \$fastq,
+"fastq_n=s" => \$fastqN, 
+"ma=i" => \$ma,
+"mis=i" => \$mis,
+"contaminants=s" => \$contaminants,
+"anno=s" => \$annotation,
+"build_contaminant" => \$build_contaminant,
+"ref=s" => \$ref,
+"build_ref" => \$build_ref,
+"dirbamCont=s" => \$dirBam_cont,
+"dirbam=s" => \$dirBam,
+"dirbedgraph=s" => \$dirBedgraph,
+"text=s" => \$outText,
+);
+
+$directional = lc($directional);
+$findSens = lc($findSens);
+$random_w = lc($random_w);
+
+if ($build_ref)
+{
+  `(bowtie2-build $ref $ref) 2> /dev/null `;
+}
+
+if ($build_contaminant)
+{
+  `(bowtie2-build $contaminants $contaminants) 2> /dev/null `;
+}
+
+#"outsimple=s" => \$outSimple
+#"annotation=s" => \$annotation,
+
+#$annotation = "/data/visiteur/annotation/ara/TAIR10_all_sorted.bed";
+#$annotation = "/data/home/pogorelcnikr/SG/humanGenes.bed";
+
+# mapping against contaminants databases  in: contaminants, fastq, # of mismatches, output_dir out: tophat dir
+tophat2($fastq, $contaminants, $mis, 1, 'dir_contaminant');
+`bamToFastq  -i dir_contaminant/unmapped.bam -fq clean.fastq`;
+
+# mapping against contaminants databases  in: $ref, fastq, # of mismatches, output_dir out: tophat dir
+tophat2('clean.fastq', $ref, $mis, $max_procs, 'dir_reference');
+
+my $repTot = 0;
+if ($ma == 0)
+{
+  open (my $rT,  'dir_reference/align_summary.txt');
+  my @repT = <$rT>;
+  $repTot = $1 if $repT[2] =~ /\s+Mapped\s+:\s+(\d+).*/;
+  close $rT;
+}
+else
+{
+  $repTot = $ma;
+}
+
+# Creation of bedgraphs
+my $dir_bedgraph = $fastqN.'_bedgraph/';
+findUnique('dir_reference/accepted_hits.bam', $dir_bedgraph, $repTot);
+
+# count in: unique_mapped.bam out: raw_count_simple antisens_count
+if ($random_w eq 'true')
+{
+  reference('rand_sorted.bam', $repTot);
+  `mv rand_sorted.bam dir_reference/random_sorted.bam`;
+}
+else
+{
+  reference('ac_sorted.bam', $repTot);
+  `mv ac_sorted.bam dir_reference/unique_sorted.bam`;
+}
+
+zip('dir_reference', $dirBam);
+zip('dir_contaminant', $dirBam_cont);
+zip( $dir_bedgraph , $dirBedgraph);
+
+#################################
+# inputs:
+# fastq $_[0]
+# bowtie2_index $_[1]
+# mismatches # $_[2]
+# output directory $_[3]
+# max_procs $_[4]
+#################################
+sub tophat2
+{
+  my ($fastq, $index, $mismtaches, $proc, $out_repertory) = @_;
+  $out_repertory = $out_repertory.'/' unless $out_repertory =~ /\/$/;
+  # print STDERR "/data/visiteur/tophat-2.0.12.Linux_x86_64/tophat2 -p $proc -N $mismtaches --b2-very-sensitive -o $out_repertory $index $fastq\n";
+  `tophat2 -p $proc -N $mismtaches --b2-very-sensitive -o $out_repertory $index $fastq 2> /dev/stdin`;
+}
+############################################################################################
+
+#################################
+# inputs:
+# bam $_[0]
+# output directory $_[1]
+#################################
+sub findUnique
+{
+  my ($bam, $out_bedgraph, $sc) = @_;
+  
+  mkdir $out_bedgraph;
+
+  my $bedg_uni =  $out_bedgraph.'/'.$fastqN.'_unique.bedgraph' ;
+  my $bedg_all_multi = $out_bedgraph.'/'.$fastqN.'_all_multi.bedgraph' ;
+  my $bedg_random = $out_bedgraph.'/'.$fastqN.'_random_multi.bedgraph' ;
+
+  my $bedg_uni_p =  $out_bedgraph.'/'.$fastqN.'_unique_plusStrand.bedgraph' ;
+  my $bedg_all_multi_p = $out_bedgraph.'/'.$fastqN.'_all_multi_plusStrand.bedgraph' ;
+  my $bedg_random_p = $out_bedgraph.'/'.$fastqN.'_random_multi_plusStrand.bedgraph' ;
+
+  my $bedg_uni_m =  $out_bedgraph.'/'.$fastqN.'_unique_minusStrand.bedgraph' ;
+  my $bedg_all_multi_m = $out_bedgraph.'/'.$fastqN.'_all_multi_minusStrand.bedgraph' ;
+  my $bedg_random_m = $out_bedgraph.'/'.$fastqN.'_random_multi_minusStrand.bedgraph' ;
+  
+  my $sca = 1000000/$sc;
+
+  #unique file
+  `samtools view -H $bam > ac.sam 2> /dev/null && samtools view -@ $max_procs $bam 2> /dev/null |grep NH:i:1\$ >>  ac.sam && samtools view -@ $max_procs -Shb ac.sam > ac.bam 2> /dev/null`;
+  `samtools sort -@ $max_procs ac.bam -o ac_sorted.bam 2> /dev/null`;
+  `genomeCoverageBed -scale $sca -split -bg -ibam ac_sorted.bam > $bedg_uni 2> /dev/null`;
+  if ($directional eq 'true')
+  {
+    `genomeCoverageBed -strand + -scale $sca -split -bg -ibam ac_sorted.bam > $bedg_uni_p 2> /dev/null`;
+    `genomeCoverageBed -strand - -scale $sca -split -bg -ibam ac_sorted.bam > $bedg_uni_m 2> /dev/null`;
+  }
+  #all multi
+  `samtools sort -@ $max_procs $bam -o accepted_hits_sorted.bam 2> /dev/null`;
+  `genomeCoverageBed -scale $sca -split -bg -ibam accepted_hits_sorted.bam > $bedg_all_multi 2> /dev/null`;
+  if ($directional eq 'true')
+  {
+    `genomeCoverageBed -strand + -scale $sca -split -bg -ibam accepted_hits_sorted.bam > $bedg_all_multi_p 2> /dev/null`;
+    `genomeCoverageBed -strand - -scale $sca -split -bg -ibam accepted_hits_sorted.bam > $bedg_all_multi_m 2> /dev/null`;
+  }
+  #random multi
+  `samtools view -@ $max_procs -hb -F256 $bam > rand.bam 2> /dev/null`;
+  `samtools sort -@ $max_procs rand.bam -o rand_sorted.bam 2> /dev/null`;
+  `genomeCoverageBed -scale $sca -split -bg -ibam rand_sorted.bam > $bedg_random 2> /dev/null`;  
+  if ($directional eq 'true')
+  {
+    `genomeCoverageBed -strand + -scale $sca -split -bg -ibam rand_sorted.bam > $bedg_random_p 2> /dev/null`;
+    `genomeCoverageBed -strand - -scale $sca -split -bg -ibam rand_sorted.bam > $bedg_random_m 2> /dev/null`;
+  }
+}
+############################################################################################
+
+#################################
+# inputs:
+# bam $_[0]
+# output directory $_[1]
+#################################
+sub reference
+{
+  my $bam = shift;
+  my $mapnum = shift;  
+
+  my @tabRef = ();
+  
+  open (my $anno, $annotation) || die "cannot open annotation file!\n";
+  while(<$anno>)
+  {
+    chomp $_;
+    my @split = split /\t/, $_;
+    my $size = $split[2] - $split[1];
+    push @tabRef, [$_,0,0,$size];
+  }
+  close $anno;
+  
+  `bedtools bamtobed -i $bam > ac.bed`;
+  
+ # my $mapnum = 0;
+ # open (my $bed, 'ac.bed') || die "cannot open ac.bed\n";
+ # while(<$bed>){ $mapnum++; }
+ # close $bed;
+  
+  if ($directional eq 'true')
+  {
+    #Same strand
+    `bedtools intersect -s -wao -b ac.bed -a $annotation> intersectStranded.bed`;
+    #Different strand
+    `bedtools intersect -S -wao -b ac.bed -a $annotation> intersectOppositeStrand.bed`;
+  
+    open (my $strand, 'intersectStranded.bed') || die "cannot open intersectStranded.bed\n";
+    my $i = 0;
+    my $firstLine = <$strand>; chomp $firstLine;
+    my @split = split /\t/, $firstLine;
+    my $prevId = $split[3];
+    if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
+    {
+      $tabRef[$i]->[1] = 1;
+    }
+    while(<$strand>)
+    {
+      chomp $_;
+      @split = split /\t/, $_;
+      if ($split[3] ne $prevId)
+      {
+        $i++;
+      }
+      if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
+      {
+        $tabRef[$i]->[1] += 1;
+      }
+      $prevId = $split[3];
+    }
+    close $strand;
+  
+    open ($strand, 'intersectOppositeStrand.bed') || die "cannot open intersectOppositeStrand.bed\n";
+    $i = 0;
+    $firstLine = <$strand>; chomp $firstLine;
+    @split = split /\t/, $firstLine;
+    $prevId = $split[3];
+    if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
+    {
+      $tabRef[$i]->[2] = 1;
+    }
+    while(<$strand>)
+    {
+      chomp $_;
+      @split = split /\t/, $_;
+      if ($split[3] ne $prevId)
+      {
+        $i++;
+      }
+      if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
+      {
+        $tabRef[$i]->[2] += 1;
+      }
+      $prevId = $split[3];
+    }
+    close $strand;
+  }
+
+  else
+  {
+    `bedtools intersect -wao -b ac.bed -a $annotation> intersect.bed`;
+
+    open (my $strand, 'intersect.bed') || die "cannot open intersect.bed\n";
+    my $i = 0;
+    my $firstLine = <$strand>; chomp $firstLine;
+    my @split = split /\t/, $firstLine;
+    my $prevId = $split[3];
+    if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
+    {
+      $tabRef[$i]->[1] = 1;
+    }
+    while(<$strand>)
+    {
+      chomp $_;
+      @split = split /\t/, $_;
+      if ($split[3] ne $prevId)
+      {
+        $i++;
+      }
+      if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
+      {
+        $tabRef[$i]->[1] += 1;
+      }
+      $prevId = $split[3];
+    }
+    close $strand; 
+  }
+  
+  my $totplus = 0; my $totminus = 0;
+  foreach my $outPut (@tabRef)
+  {
+    $totplus += $outPut->[1];
+    $totminus += $outPut->[2];
+  }
+    
+  open (my $out, ">".$outText) || die "cannot open raw_count.txt\n";
+  print $out "Chr\tstart\tend\tID\tscore\tstrand\ttype\tD1\tD2\tD3\treads\trpm\treads on opposite strand\trpm\n";
+  my ($Sr, $Rr) = (0,0);
+  foreach my $outPut (@tabRef)
+  {
+    if ($totplus > $totminus || $findSens eq 'false')
+    {
+      $Sr = rpm($outPut->[1],$mapnum);
+      $Rr = rpm($outPut->[2],$mapnum);
+      print $out $outPut->[0]."\t".$outPut->[1]."\t".$Sr."\t".$outPut->[2]."\t".$Rr."\n";
+    }
+    else
+    {
+      $Rr = rpm($outPut->[1],$mapnum);
+      $Sr = rpm($outPut->[2],$mapnum);
+      print $out $outPut->[0]."\t".$outPut->[2]."\t".$Sr."\t".$outPut->[1]."\t".$Rr."\n";
+    }
+  }
+  close $out;
+}
+############################################################################################
+
+#################################
+# inputs:
+# bam $_[0]
+# output directory $_[1]
+#################################
+sub rpm
+{
+  my $raw_count = shift;
+  my $mapped_number = shift;
+  my $RPM = 0;
+  $RPM = ($raw_count * 1000000) / ($mapped_number) if  $mapped_number != 0;
+  return $RPM;
+}
+############################################################################################
+
+#################################
+# inputs:
+# bam $_[0]
+# output directory $_[1]
+#################################
+sub zip
+{
+  my ($in_dir, $out_tar) = @_;
+  `tar -cvf tmp.tar $in_dir/`;
+  `mv tmp.tar $out_tar`;
+  rmdir $in_dir;
+}
+############################################################################################
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gred_rnaseq-2369cfe4f737/alignTophat.xml	Thu Nov 16 10:14:55 2017 -0500
@@ -0,0 +1,151 @@
+<tool id="alignTophat" name="align RNAseq" version="0.0.1">
+  <description>Align directionnal RNAseq to obtain bedgraphs and raw count file</description>
+  <command interpreter="perl">
+    
+    ./alignTophat.pl
+    
+    --fastq ${first_input}
+    --fastq_n ${first_input.name}
+    
+    #if $Genome.refGenomeSource == "history":
+    --ref "${Genome.ownFile}"
+    --build_ref
+    #else:
+    --ref "${Genome.indices.fields.path}"
+    #end if
+    
+    #if $contaminants.refGenomeSource == "history":
+    --contaminants "${contaminants.ownFile}"
+    --build_contaminant
+    #else:
+    --contaminants "${contaminants.indices.fields.path}"
+    #end if
+    #if $anno.bedpipe == "history":
+      --annot "${anno.ownFile}"
+    #else:
+      --anno "${anno.indices.fields.path}"
+    #end if
+
+    --mis  $mis
+    --ma $ma
+    
+    --dirbam $dirbam
+    --dirbamCont $dirbamcont
+    --dirbedgraph $dirbedgraph
+    
+    --text $text
+    
+    --random_w $rand
+    --find $idDir.find
+    --direction $idDir.direc
+  </command>
+  <inputs>
+    
+    <param format="fastqsanger" name="first_input" type="data" label="fastq produced by fasteris" help=""/>
+    
+    
+    <conditional name="Genome">
+      <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="indices" type="select" label="Select a reference genome">
+          <options from_data_table="bowtie2_indexes">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No indexes are available" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+      </when>
+    </conditional>
+    
+    
+    <conditional name="Genome">
+      <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="indices" type="select" label="Select a reference genome">
+          <options from_data_table="bowtie2_indexes">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No indexes are available" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+      </when>
+    </conditional>
+    
+    <conditional name="contaminants">
+      <param name="refGenomeSource" type="select" label="Will you select contaminants database from your history or use a built-in index?">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="indices" type="select" label="Select contaminants reference">
+          <options from_data_table="bowtie2_indexes">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No indexes are available" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+      </when>
+    </conditional>
+    
+    <conditional name="anno">
+      <param name="bedpipe" type="select" label="Will you select an annotation file from your history or use a built-in index?" help="tab file format, (chr start end id score strand type description1 description2 description3">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="indices" type="select" label="Select annotation file">
+          <options from_data_table="bed_pipe">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No indexes are available" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="bed" metadata_name="dbkey" label="Select a reference from history" />
+      </when>
+    </conditional>
+    
+    
+    <param name="mis" type="integer" value="0" label="mismatches"/>
+    <param name="ma" type="integer" value="0" label="normalization value" help="e.g number of mappers (if 0 auto)"/> 
+    <param name="rand" type="boolean" checked="false" label="work on multi-mappers randomly assigned"/>
+
+
+    <conditional name="idDir">
+      <param name="direc" type="boolean" checked="true" label="directional library"/>
+      <when value="false">
+        <param name="find" type="hidden" value="false"/>
+      </when>
+      <when value="true">
+        <param name="find" type="boolean" checked="true" label="auto sens finder"/>
+      </when>
+     </conditional>
+  </inputs>
+  
+  <outputs>
+    <data format="tar" name="dirbam" label="alignement_zip_${first_input.name}"/>
+    <data format="tar" name="dirbamcont" label="contaminant_zip_${first_input.name}"/>
+    <data format="tar" name="dirbedgraph" label="bedgraph_zip_${first_input.name}"/>
+    <data format="tabular" name="text" label="raw_count_tabular_${first_input.name}_rand_${rand}"/>
+  </outputs>
+  
+  <requirements>
+    <requirement type="package" version="2.2.6">bowtie2</requirement>
+    <requirement type="package" version="2.1.0">tophat</requirement>
+    <requirement type="package" version="2.17.0">bedtools</requirement>
+    <requirement type="package" version="0.1.19">samtools</requirement>
+   </requirements>
+  
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gred_rnaseq-2369cfe4f737/bed_pipe.loc.sample	Thu Nov 16 10:14:55 2017 -0500
@@ -0,0 +1,2 @@
+#<unique_id>    <database_caption>      <base_name_path>
+dmel	dmel_annotation	/path/to/annotation/dmel/dmel_gene.bed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gred_rnaseq-2369cfe4f737/tool_data_table_conf.xml.sample	Thu Nov 16 10:14:55 2017 -0500
@@ -0,0 +1,6 @@
+<tables>
+    <table name="bed_pipe" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/bed_pipe.loc" />
+    </table>
+</tables>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gred_rnaseq-2369cfe4f737/tool_dependencies.xml	Thu Nov 16 10:14:55 2017 -0500
@@ -0,0 +1,11 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="bowtie2" version="2.2.6">
+	  </package>
+    <package name="tophat" version="2.1.0">
+    </package>
+    <package name="bedtools" version="2.24.0">
+    </package>
+    <package name="samtools" version="1.3.1">
+    </package>
+</tool_dependency>