Mercurial > repos > sarahinraauzeville > star
changeset 2:80e19490ec6a draft
Uploaded
author | sarahinraauzeville |
---|---|
date | Tue, 12 Dec 2017 10:08:56 -0500 |
parents | e8dbc8b9a59a |
children | aeebcdb9b8b2 |
files | sm_STAR2_V2.pl |
diffstat | 1 files changed, 236 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sm_STAR2_V2.pl Tue Dec 12 10:08:56 2017 -0500 @@ -0,0 +1,236 @@ +#!/usr/bin/perl -w + +# usage : perl sm_STAR.pl <read1.fastq.gz> <read2.fastq.gz> +# 10/02/2014 - Wrapper du traitement des données RNAseq +# Sarah Maman +# Copyright (C) 2014 INRA +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +use strict; +use File::Basename; +use Getopt::Long; +use lib "$ENV{'MY_GALAXY_DIR'}"; +use GalaxyPath; + +my $cfg = GalaxyPath->new( -file => $ENV{"GALAXY_CONFIG_FILE"}); +my $PATH = $cfg->my_path( 'workPath', 'MYWORKSPACE' ); +my $STAR = $cfg->my_path( 'toolsPath', 'STAR_PATH' ); + + + +my $Nthreads; +my $genome_path; +my $reads_selector; +my $input_read; +my $Read1fastqgz; +my $Read2fastqgz; +my $alignIntronMin; +my $alignIntronMax; +my $outFilterMismatchNmax; +my $orientation; +my $refownfastaref; +my $refselector; +my $refowngtf; +my $compress; +my $cufflinks; +my $outputfile; +my $outputfileT; +my $outputlogSJ; +my $outputlogfinal; + + +Getopt::Long::Configure( 'no_ignorecase', 'bundling' ); +GetOptions ( + 'runThreadN=i' => \$Nthreads, + 'genomeDir=s' => \$genome_path, + 'refselector=s' => \$refselector, + 'refownfastaref=s' => \$refownfastaref, + 'refowngtf=s' => \$refowngtf, + 'compress=s' => \$compress, + 'cufflinks=s' => \$cufflinks, + 'readsselector=s'=> \$reads_selector, + 'readFilesIn1=s' => \$Read1fastqgz, + 'readFilesIn2=s' => \$Read2fastqgz, + 'readsinputread=s' => \$input_read, + 'alignIntronMin=i' => \$alignIntronMin, + 'alignIntronMax=i' => \$alignIntronMax, + 'outFilterMismatchNmax=i' => \$outFilterMismatchNmax, + 'orientation=s' => \$orientation, + 'outputfile=s' => \$outputfile, + 'outputfileT=s' => \$outputfileT, + 'outputlogfinal=s' => \$outputlogfinal, + 'outputlogSJ=s' => \$outputlogSJ +) or die "Usage: Error in command line arguments\n"; + +my $cmd1 = ''; my $cmd2 =''; +my $cmd3 = ''; my $cmd4 =''; + +#STAR --runThreadN 4 --runMode genomeGenerate --genomeDir /work/smaman/TP_RNAseq/INDEX/ --genomeFastaFiles ITAG2.3_genomic_Ch6.fasta --sjdbGTFfile ITAG_pre2.3_gene_models_Ch6.gtf --sjdbOverhang 100 + +#smaman@node001 /work/smaman/TP_RNAseq $ ls -ltrah INDEX +#-rw-r--r-- 1 smaman BIOINFO 331 17 juil. 11:55 genomeParameters.txt +#-rw-r--r-- 1 smaman BIOINFO 387K 17 juil. 11:55 exonGeTrInfo.tab +#-rw-r--r-- 1 smaman BIOINFO 53K 17 juil. 11:55 geneInfo.tab +#-rw-r--r-- 1 smaman BIOINFO 151K 17 juil. 11:55 transcriptInfo.tab +#-rw-r--r-- 1 smaman BIOINFO 171K 17 juil. 11:55 exonInfo.tab +#-rw-r--r-- 1 smaman BIOINFO 325K 17 juil. 11:55 sjdbList.fromGTF.out.tab +#-rw-r--r-- 1 smaman BIOINFO 272K 17 juil. 11:55 sjdbInfo.txt +#-rw-r--r-- 1 smaman BIOINFO 325K 17 juil. 11:55 sjdbList.out.tab +#-rw-r--r-- 1 smaman BIOINFO 11 17 juil. 11:55 chrName.txt +#-rw-r--r-- 1 smaman BIOINFO 9 17 juil. 11:55 chrLength.txt +#-rw-r--r-- 1 smaman BIOINFO 11 17 juil. 11:55 chrStart.txt +#-rw-r--r-- 1 smaman BIOINFO 20 17 juil. 11:55 chrNameLength.txt +#-rw-r--r-- 1 smaman BIOINFO 47M 17 juil. 11:55 Genome +#-rw-r--r-- 1 smaman BIOINFO 360M 17 juil. 11:55 SA +#-rw-r--r-- 1 smaman BIOINFO 1,5G 17 juil. 11:55 SAindex + + +#STAR --readFilesIn WTr1.fastq WTr2.fastq --genomeDir /work/smaman/TP_RNAseq/INDEX/ --sjdbGTFfile ITAG_pre2.3_gene_models_Ch6.gtf --outSAMtype BAM SortedByCoordinate --alignIntronMin 20 --alignIntronMax 1000000 --outFilterMismatchNmax 10 --outSAMtype BAM SortedByCoordinate --runThreadN 4 --outFileNamePrefix galaxyName --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --outFilterType BySJout --quantMode TranscriptomeSAM + +#-rw-r--r-- 1 smaman BIOINFO 45M 26 mars 2015 ITAG2.3_genomic_Ch6.fasta +#-rw-r--r-- 1 smaman BIOINFO 1,6M 26 mars 2015 ITAG_pre2.3_gene_models_Ch6.gtf +#-rw-r--r-- 1 smaman BIOINFO 29 26 mars 2015 ITAG2.3_genomic_Ch6.fasta.fai +#-rw-r--r-- 1 smaman BIOINFO 614 17 juil. 10:20 WTr1.fastq +#-rw-r--r-- 1 smaman BIOINFO 589 17 juil. 10:20 WTr2.fastq +#-rw-r--r-- 1 smaman BIOINFO 14K 17 juil. 11:55 Log.out +#-rw-r--r-- 1 smaman BIOINFO 35K 17 juil. 12:03 galaxyNameAligned.toTranscriptome.out.bam +#-rw-r--r-- 1 smaman BIOINFO 637 17 juil. 12:03 galaxyNameAligned.sortedByCoord.out.bam +++++++++ +#-rw-r--r-- 1 smaman BIOINFO 0 17 juil. 12:03 galaxyNameSJ.out.tab ++++++++++++++++ +#-rw-r--r-- 1 smaman BIOINFO 246 17 juil. 12:03 galaxyNameLog.progress.out +#-rw-r--r-- 1 smaman BIOINFO 1,7K 17 juil. 12:03 galaxyNameLog.final.out +++++++++++++++ +#-rw-r--r-- 1 smaman BIOINFO 16K 17 juil. 12:03 galaxyNameLog.out + + + +#workspace +my $debug = 0; #Mode debug +if ($debug == 0) + { + print STDOUT "Debug mode OK \n"; + } +else + { + $PATH = dirname($outputfile); + print STDOUT "No debug \n"; + } + + +#Récuperer le numero (unique) de l'output afin, si besoin, de créer un répertoire de travail unique dans /work/galaxy-dev/workspace +my ($nb) = ($outputfile=~/dataset_(\d+)\.\S+$/); + +#Repertoire de sortie cree par le script, verif des droits d'ecriture sur ce repertoire de sortie +`cd $PATH/; mkdir $nb/; chmod -R 777 $nb/; cd $nb/;`; +my $dirresults= "$PATH/".$nb; + +print STDOUT "Job working directory : $dirresults \n"; + + +if ($refselector eq "ownfasta"){ + my $cmdSTARindex="(cd $dirresults/; mkdir INDEX/; chmod 777 INDEX/; $STAR --runThreadN $Nthreads --runMode genomeGenerate --genomeDir $dirresults/INDEX --genomeFastaFiles $refownfastaref --sjdbGTFfile $refowngtf --sjdbOverhang 100) >& ./out_Starindex.log 2>&1"; + system $cmdSTARindex; + #Info pour les biologistes + print STDOUT "STAR Genome Generate : \n\n $cmdSTARindex \n\n "; + $genome_path = "$dirresults/INDEX/"; +} + +my $addcuff; +if ($cufflinks eq "cuff"){ + $addcuff="--outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --outFilterType BySJout --quantMode TranscriptomeSAM "; +}else{ + $addcuff=""; +} + + +my $cat; +if ($reads_selector eq "single"){ + + my $in; + if ($compress eq "compress"){ + #Si besoin, recupération du fichier de configuration avec modification de l extension + `ln -s $input_read $dirresults/input_read.fastq.gz;`; + $in = "$dirresults/input_read.fastq.gz"; + $cat="--readFilesCommand zcat"; + }else + {`ln -s $input_read $dirresults/input_read.fastq;`; + $in = "$dirresults/input_read.fastq"; + $cat="";} + + if ($orientation eq "No"){ + $cmd1 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; + system $cmd1; + #Info pour les biologistes + print STDOUT "STAR command run on cluster without oriented reads : \n\n $cmd1 \n\n "; + } + else + { + $cmd2 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; + system $cmd2; + #Info pour les biologistes + print STDOUT "STAR command run on cluster with oriented reads : \n\n $cmd2 \n\n + Instead, you need to run Cufflinks with the library option --library-type options. For example, cufflinks <…> -library-type fr-firststrand should be used for the “standard” dUTP protocol. This option has to be used only for Cufflinks runs and not for STAR runs.\n\n"; + } +}else{ + + + my $in1; + my $in2; + if ($compress eq "compress"){ + #Si besoin, recupération du fichier de configuration avec modification de l extension + `ln -s $Read1fastqgz $dirresults/Read1.fastq.gz; ln -s $Read2fastqgz $dirresults/Read2.fastq.gz;`; + $in1="$dirresults/Read1.fastq.gz"; + $in2="$dirresults/Read2.fastq.gz"; + $cat="--readFilesCommand zcat"; + }else + {`ln -s $Read1fastqgz $dirresults/Read1.fastq; ln -s $Read2fastqgz $dirresults/Read2.fastq;`; + $in1="$dirresults/Read1.fastq"; + $in2="$dirresults/Read2.fastq"; + $cat="";} + + + if ($orientation eq "No"){ + $cmd3 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in1 $in2 --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; + system $cmd3; + #Info pour les biologistes + print STDOUT "STAR command run on cluster without oriented reads : \n\n $cmd3 \n\n "; + } + else + { + $cmd4 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in1 $in2 --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; + #Info pour les biologistes + system $cmd4; + print STDOUT "STAR command run on cluster with oriented reads : \n\n $cmd4 \n\n + Instead, you need to run Cufflinks with the library option --library-type options. For example, cufflinks <…> -library-type fr-firststrand should be used for the “standard” dUTP protocol. This option has to be used only for Cufflinks runs and not for STAR runs.\n\n"; + } + + +} + +#Si besoin : +#TEST 1 : command ligne on vm-galaxy +#TEST 2 perl Galaxy file : perl script.pl path/to/tests/files/used/for/galaxy/perl/script out1 + +#Recuperation des fichiers par Galaxy +#-rw-r--r-- 1 smaman BIOINFO 35K 17 juil. 12:03 galaxyNameAligned.toTranscriptome.out.bam +++++ +#-rw-r--r-- 1 smaman BIOINFO 637 17 juil. 12:03 galaxyNameAligned.sortedByCoord.out.bam +++++++++ +#-rw-r--r-- 1 smaman BIOINFO 0 17 juil. 12:03 galaxyNameSJ.out.tab ++++++++++++++++ +#-rw-r--r-- 1 smaman BIOINFO 1,7K 17 juil. 12:03 galaxyNameLog.final.out +++++++++++++++ +my $bam = glob("$dirresults/*$nb*Aligned.sortedByCoord.out.bam"); +if (! -e $bam){print STDERR "Aligned.sortedByCoord.out.bam file not found. \n";}else{`cp -a $bam $outputfile`;} +my $bamT = glob("$dirresults/*$nb*Aligned.toTranscriptome.out.bam"); +if (! -e $bamT){print STDERR "Aligned.toTranscriptome.out.bam file not found. \n";}else{`cp -a $bamT $outputfileT`;} +my $logSJ = glob("$dirresults/$nb*SJ.out.tab"); +if (! -e $logSJ){print STDERR "SJ.out.tab log file not found. \n";}else{`cp -a $logSJ $outputlogSJ`;} +my $logfinal = glob("$dirresults/$nb*Log.final.out"); +if (! -e $logfinal){print STDERR "Log.final.out log file not found. \n";}else{`cp -a $logfinal $outputlogfinal`;} +