Mercurial > repos > sarahinraauzeville > star
comparison sm_STAR2_V2.pl @ 2:80e19490ec6a draft
Uploaded
author | sarahinraauzeville |
---|---|
date | Tue, 12 Dec 2017 10:08:56 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:e8dbc8b9a59a | 2:80e19490ec6a |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 # usage : perl sm_STAR.pl <read1.fastq.gz> <read2.fastq.gz> | |
4 # 10/02/2014 - Wrapper du traitement des données RNAseq | |
5 # Sarah Maman | |
6 # Copyright (C) 2014 INRA | |
7 # This program is free software: you can redistribute it and/or modify | |
8 # it under the terms of the GNU General Public License as published by | |
9 # the Free Software Foundation, either version 3 of the License, or | |
10 # (at your option) any later version. | |
11 # | |
12 # This program is distributed in the hope that it will be useful, | |
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 # GNU General Public License for more details. | |
16 # | |
17 # You should have received a copy of the GNU General Public License | |
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
19 # | |
20 use strict; | |
21 use File::Basename; | |
22 use Getopt::Long; | |
23 use lib "$ENV{'MY_GALAXY_DIR'}"; | |
24 use GalaxyPath; | |
25 | |
26 my $cfg = GalaxyPath->new( -file => $ENV{"GALAXY_CONFIG_FILE"}); | |
27 my $PATH = $cfg->my_path( 'workPath', 'MYWORKSPACE' ); | |
28 my $STAR = $cfg->my_path( 'toolsPath', 'STAR_PATH' ); | |
29 | |
30 | |
31 | |
32 my $Nthreads; | |
33 my $genome_path; | |
34 my $reads_selector; | |
35 my $input_read; | |
36 my $Read1fastqgz; | |
37 my $Read2fastqgz; | |
38 my $alignIntronMin; | |
39 my $alignIntronMax; | |
40 my $outFilterMismatchNmax; | |
41 my $orientation; | |
42 my $refownfastaref; | |
43 my $refselector; | |
44 my $refowngtf; | |
45 my $compress; | |
46 my $cufflinks; | |
47 my $outputfile; | |
48 my $outputfileT; | |
49 my $outputlogSJ; | |
50 my $outputlogfinal; | |
51 | |
52 | |
53 Getopt::Long::Configure( 'no_ignorecase', 'bundling' ); | |
54 GetOptions ( | |
55 'runThreadN=i' => \$Nthreads, | |
56 'genomeDir=s' => \$genome_path, | |
57 'refselector=s' => \$refselector, | |
58 'refownfastaref=s' => \$refownfastaref, | |
59 'refowngtf=s' => \$refowngtf, | |
60 'compress=s' => \$compress, | |
61 'cufflinks=s' => \$cufflinks, | |
62 'readsselector=s'=> \$reads_selector, | |
63 'readFilesIn1=s' => \$Read1fastqgz, | |
64 'readFilesIn2=s' => \$Read2fastqgz, | |
65 'readsinputread=s' => \$input_read, | |
66 'alignIntronMin=i' => \$alignIntronMin, | |
67 'alignIntronMax=i' => \$alignIntronMax, | |
68 'outFilterMismatchNmax=i' => \$outFilterMismatchNmax, | |
69 'orientation=s' => \$orientation, | |
70 'outputfile=s' => \$outputfile, | |
71 'outputfileT=s' => \$outputfileT, | |
72 'outputlogfinal=s' => \$outputlogfinal, | |
73 'outputlogSJ=s' => \$outputlogSJ | |
74 ) or die "Usage: Error in command line arguments\n"; | |
75 | |
76 my $cmd1 = ''; my $cmd2 =''; | |
77 my $cmd3 = ''; my $cmd4 =''; | |
78 | |
79 #STAR --runThreadN 4 --runMode genomeGenerate --genomeDir /work/smaman/TP_RNAseq/INDEX/ --genomeFastaFiles ITAG2.3_genomic_Ch6.fasta --sjdbGTFfile ITAG_pre2.3_gene_models_Ch6.gtf --sjdbOverhang 100 | |
80 | |
81 #smaman@node001 /work/smaman/TP_RNAseq $ ls -ltrah INDEX | |
82 #-rw-r--r-- 1 smaman BIOINFO 331 17 juil. 11:55 genomeParameters.txt | |
83 #-rw-r--r-- 1 smaman BIOINFO 387K 17 juil. 11:55 exonGeTrInfo.tab | |
84 #-rw-r--r-- 1 smaman BIOINFO 53K 17 juil. 11:55 geneInfo.tab | |
85 #-rw-r--r-- 1 smaman BIOINFO 151K 17 juil. 11:55 transcriptInfo.tab | |
86 #-rw-r--r-- 1 smaman BIOINFO 171K 17 juil. 11:55 exonInfo.tab | |
87 #-rw-r--r-- 1 smaman BIOINFO 325K 17 juil. 11:55 sjdbList.fromGTF.out.tab | |
88 #-rw-r--r-- 1 smaman BIOINFO 272K 17 juil. 11:55 sjdbInfo.txt | |
89 #-rw-r--r-- 1 smaman BIOINFO 325K 17 juil. 11:55 sjdbList.out.tab | |
90 #-rw-r--r-- 1 smaman BIOINFO 11 17 juil. 11:55 chrName.txt | |
91 #-rw-r--r-- 1 smaman BIOINFO 9 17 juil. 11:55 chrLength.txt | |
92 #-rw-r--r-- 1 smaman BIOINFO 11 17 juil. 11:55 chrStart.txt | |
93 #-rw-r--r-- 1 smaman BIOINFO 20 17 juil. 11:55 chrNameLength.txt | |
94 #-rw-r--r-- 1 smaman BIOINFO 47M 17 juil. 11:55 Genome | |
95 #-rw-r--r-- 1 smaman BIOINFO 360M 17 juil. 11:55 SA | |
96 #-rw-r--r-- 1 smaman BIOINFO 1,5G 17 juil. 11:55 SAindex | |
97 | |
98 | |
99 #STAR --readFilesIn WTr1.fastq WTr2.fastq --genomeDir /work/smaman/TP_RNAseq/INDEX/ --sjdbGTFfile ITAG_pre2.3_gene_models_Ch6.gtf --outSAMtype BAM SortedByCoordinate --alignIntronMin 20 --alignIntronMax 1000000 --outFilterMismatchNmax 10 --outSAMtype BAM SortedByCoordinate --runThreadN 4 --outFileNamePrefix galaxyName --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --outFilterType BySJout --quantMode TranscriptomeSAM | |
100 | |
101 #-rw-r--r-- 1 smaman BIOINFO 45M 26 mars 2015 ITAG2.3_genomic_Ch6.fasta | |
102 #-rw-r--r-- 1 smaman BIOINFO 1,6M 26 mars 2015 ITAG_pre2.3_gene_models_Ch6.gtf | |
103 #-rw-r--r-- 1 smaman BIOINFO 29 26 mars 2015 ITAG2.3_genomic_Ch6.fasta.fai | |
104 #-rw-r--r-- 1 smaman BIOINFO 614 17 juil. 10:20 WTr1.fastq | |
105 #-rw-r--r-- 1 smaman BIOINFO 589 17 juil. 10:20 WTr2.fastq | |
106 #-rw-r--r-- 1 smaman BIOINFO 14K 17 juil. 11:55 Log.out | |
107 #-rw-r--r-- 1 smaman BIOINFO 35K 17 juil. 12:03 galaxyNameAligned.toTranscriptome.out.bam | |
108 #-rw-r--r-- 1 smaman BIOINFO 637 17 juil. 12:03 galaxyNameAligned.sortedByCoord.out.bam +++++++++ | |
109 #-rw-r--r-- 1 smaman BIOINFO 0 17 juil. 12:03 galaxyNameSJ.out.tab ++++++++++++++++ | |
110 #-rw-r--r-- 1 smaman BIOINFO 246 17 juil. 12:03 galaxyNameLog.progress.out | |
111 #-rw-r--r-- 1 smaman BIOINFO 1,7K 17 juil. 12:03 galaxyNameLog.final.out +++++++++++++++ | |
112 #-rw-r--r-- 1 smaman BIOINFO 16K 17 juil. 12:03 galaxyNameLog.out | |
113 | |
114 | |
115 | |
116 #workspace | |
117 my $debug = 0; #Mode debug | |
118 if ($debug == 0) | |
119 { | |
120 print STDOUT "Debug mode OK \n"; | |
121 } | |
122 else | |
123 { | |
124 $PATH = dirname($outputfile); | |
125 print STDOUT "No debug \n"; | |
126 } | |
127 | |
128 | |
129 #Récuperer le numero (unique) de l'output afin, si besoin, de créer un répertoire de travail unique dans /work/galaxy-dev/workspace | |
130 my ($nb) = ($outputfile=~/dataset_(\d+)\.\S+$/); | |
131 | |
132 #Repertoire de sortie cree par le script, verif des droits d'ecriture sur ce repertoire de sortie | |
133 `cd $PATH/; mkdir $nb/; chmod -R 777 $nb/; cd $nb/;`; | |
134 my $dirresults= "$PATH/".$nb; | |
135 | |
136 print STDOUT "Job working directory : $dirresults \n"; | |
137 | |
138 | |
139 if ($refselector eq "ownfasta"){ | |
140 my $cmdSTARindex="(cd $dirresults/; mkdir INDEX/; chmod 777 INDEX/; $STAR --runThreadN $Nthreads --runMode genomeGenerate --genomeDir $dirresults/INDEX --genomeFastaFiles $refownfastaref --sjdbGTFfile $refowngtf --sjdbOverhang 100) >& ./out_Starindex.log 2>&1"; | |
141 system $cmdSTARindex; | |
142 #Info pour les biologistes | |
143 print STDOUT "STAR Genome Generate : \n\n $cmdSTARindex \n\n "; | |
144 $genome_path = "$dirresults/INDEX/"; | |
145 } | |
146 | |
147 my $addcuff; | |
148 if ($cufflinks eq "cuff"){ | |
149 $addcuff="--outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --outFilterType BySJout --quantMode TranscriptomeSAM "; | |
150 }else{ | |
151 $addcuff=""; | |
152 } | |
153 | |
154 | |
155 my $cat; | |
156 if ($reads_selector eq "single"){ | |
157 | |
158 my $in; | |
159 if ($compress eq "compress"){ | |
160 #Si besoin, recupération du fichier de configuration avec modification de l extension | |
161 `ln -s $input_read $dirresults/input_read.fastq.gz;`; | |
162 $in = "$dirresults/input_read.fastq.gz"; | |
163 $cat="--readFilesCommand zcat"; | |
164 }else | |
165 {`ln -s $input_read $dirresults/input_read.fastq;`; | |
166 $in = "$dirresults/input_read.fastq"; | |
167 $cat="";} | |
168 | |
169 if ($orientation eq "No"){ | |
170 $cmd1 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; | |
171 system $cmd1; | |
172 #Info pour les biologistes | |
173 print STDOUT "STAR command run on cluster without oriented reads : \n\n $cmd1 \n\n "; | |
174 } | |
175 else | |
176 { | |
177 $cmd2 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; | |
178 system $cmd2; | |
179 #Info pour les biologistes | |
180 print STDOUT "STAR command run on cluster with oriented reads : \n\n $cmd2 \n\n | |
181 Instead, you need to run Cufflinks with the library option --library-type options. For example, cufflinks <…> -library-type fr-firststrand should be used for the “standard” dUTP protocol. This option has to be used only for Cufflinks runs and not for STAR runs.\n\n"; | |
182 } | |
183 }else{ | |
184 | |
185 | |
186 my $in1; | |
187 my $in2; | |
188 if ($compress eq "compress"){ | |
189 #Si besoin, recupération du fichier de configuration avec modification de l extension | |
190 `ln -s $Read1fastqgz $dirresults/Read1.fastq.gz; ln -s $Read2fastqgz $dirresults/Read2.fastq.gz;`; | |
191 $in1="$dirresults/Read1.fastq.gz"; | |
192 $in2="$dirresults/Read2.fastq.gz"; | |
193 $cat="--readFilesCommand zcat"; | |
194 }else | |
195 {`ln -s $Read1fastqgz $dirresults/Read1.fastq; ln -s $Read2fastqgz $dirresults/Read2.fastq;`; | |
196 $in1="$dirresults/Read1.fastq"; | |
197 $in2="$dirresults/Read2.fastq"; | |
198 $cat="";} | |
199 | |
200 | |
201 if ($orientation eq "No"){ | |
202 $cmd3 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in1 $in2 --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; | |
203 system $cmd3; | |
204 #Info pour les biologistes | |
205 print STDOUT "STAR command run on cluster without oriented reads : \n\n $cmd3 \n\n "; | |
206 } | |
207 else | |
208 { | |
209 $cmd4 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in1 $in2 --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; | |
210 #Info pour les biologistes | |
211 system $cmd4; | |
212 print STDOUT "STAR command run on cluster with oriented reads : \n\n $cmd4 \n\n | |
213 Instead, you need to run Cufflinks with the library option --library-type options. For example, cufflinks <…> -library-type fr-firststrand should be used for the “standard” dUTP protocol. This option has to be used only for Cufflinks runs and not for STAR runs.\n\n"; | |
214 } | |
215 | |
216 | |
217 } | |
218 | |
219 #Si besoin : | |
220 #TEST 1 : command ligne on vm-galaxy | |
221 #TEST 2 perl Galaxy file : perl script.pl path/to/tests/files/used/for/galaxy/perl/script out1 | |
222 | |
223 #Recuperation des fichiers par Galaxy | |
224 #-rw-r--r-- 1 smaman BIOINFO 35K 17 juil. 12:03 galaxyNameAligned.toTranscriptome.out.bam +++++ | |
225 #-rw-r--r-- 1 smaman BIOINFO 637 17 juil. 12:03 galaxyNameAligned.sortedByCoord.out.bam +++++++++ | |
226 #-rw-r--r-- 1 smaman BIOINFO 0 17 juil. 12:03 galaxyNameSJ.out.tab ++++++++++++++++ | |
227 #-rw-r--r-- 1 smaman BIOINFO 1,7K 17 juil. 12:03 galaxyNameLog.final.out +++++++++++++++ | |
228 my $bam = glob("$dirresults/*$nb*Aligned.sortedByCoord.out.bam"); | |
229 if (! -e $bam){print STDERR "Aligned.sortedByCoord.out.bam file not found. \n";}else{`cp -a $bam $outputfile`;} | |
230 my $bamT = glob("$dirresults/*$nb*Aligned.toTranscriptome.out.bam"); | |
231 if (! -e $bamT){print STDERR "Aligned.toTranscriptome.out.bam file not found. \n";}else{`cp -a $bamT $outputfileT`;} | |
232 my $logSJ = glob("$dirresults/$nb*SJ.out.tab"); | |
233 if (! -e $logSJ){print STDERR "SJ.out.tab log file not found. \n";}else{`cp -a $logSJ $outputlogSJ`;} | |
234 my $logfinal = glob("$dirresults/$nb*Log.final.out"); | |
235 if (! -e $logfinal){print STDERR "Log.final.out log file not found. \n";}else{`cp -a $logfinal $outputlogfinal`;} | |
236 |