Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/prada-preprocess-unc @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 #This is a program to implement the PRADA pipeline "process subsection", based on the work of Rahul Vegesna and Wandaliz Torres-Garcia | |
| 4 #It merely generates a PBS file, depending on the user input entry point (step) | |
| 5 #Author: Siyuan Zheng, szheng2@mdanderson.org | |
| 6 #Copy right belongs to Roel Verhaak's lab from MD Anderson Cancer Center, Department of Bioinformatics and Computational Biology. | |
| 7 #Last revision: 02/17/2014 | |
| 8 | |
| 9 import subprocess | |
| 10 import os,os.path | |
| 11 import sys | |
| 12 import time | |
| 13 import ioprada | |
| 14 import re | |
| 15 | |
| 16 ######################################################################################## | |
| 17 args=sys.argv | |
| 18 | |
| 19 help_menu='''\nPipeline for RNAseq Data Analaysis - preprocessing pipeline (PRADA). | |
| 20 \t**Usage**: | |
| 21 \tprada-preprocess-bi -conf xx.txt -inputdir .. -sample XX -tag TCGA-XX -platform illumina -step 1_1 -intermediate no -pbs xxx -outdir ... -submit no | |
| 22 \t**Parameters**: | |
| 23 \t-h print help message | |
| 24 \t-step_info print complete steps curated in the module. | |
| 25 \t-inputdir the dir where the input bam or fastq can be found. | |
| 26 \t-sample input sample name. PRADA searches for sample.bam or sample.end1.fastq/sample.end2.fastq, etc, depending on the | |
| 27 \t initiating step number. See step_info for more information. | |
| 28 \t-conf config file for references and parameters. Default is conf.txt in py-PRADA installation folder. | |
| 29 \t-tag a tag to describe the sample, likely sample ID, such as TCGA-LGG-01; no default. | |
| 30 \t-platform only illumina at present (default). | |
| 31 \t-step values: 1_1/2,2_e1/2_1/2/3/4,3_e1/2_1/2,4_1/2,5,6_1/2,7,8; example 2_e1_1; no default. | |
| 32 \t-outdir output dir. Default is the directory where the input bam is. | |
| 33 \t-pbs name for output pbs file and log file. Default (time-stamp) is used if no input. | |
| 34 \t-intermediate values:yes/no; if intermediate files should be kept. Default is not. | |
| 35 \t-submit if submit the job to HPC, default is no. If yes, ppn is set to 12. | |
| 36 \t-v print version information. | |
| 37 ''' | |
| 38 | |
| 39 steps_info=''' | |
| 40 Command orders (sample XX) | |
| 41 step 1_1 --> XX.sorted.bam [sort input bam by name] | |
| 42 step 1_2 --> XX.end1/2.fastq [extract reads from bam] | |
| 43 step 2_e1_1 --> XX.end1.sai [realign end1 reads to composite reference] | |
| 44 step 2_e1_2 --> XX.end1.sam [generate sam] | |
| 45 step 2_e1_3 --> XX.end1.bam [generate bam] | |
| 46 step 2_e1_4 --> XX.end1.sorted.bam [sort bam] | |
| 47 step 2_e2_1 --> XX.end2.sai [realign end2 reads to composite reference] | |
| 48 step 2_e2_2 --> XX.end2.sam [generate sam] | |
| 49 step 2_e2_3 --> XX.end2.bam [generate bam] | |
| 50 step 2_e2_4 --> XX.end2.sorted.bam [sort bam] | |
| 51 step 3_e1_1 --> XX.end1.remapped.bam [remap end1 to genome] | |
| 52 step 3_e1_2 --> XX.end1.remapped.sorted.bam [sort remapped bam] | |
| 53 step 3_e2_1 --> XX.end2.remapped.bam [remap end2 to genome] | |
| 54 step 3_e2_2 --> XX.end2.remapped.sorted.bam [sort remapped bam] | |
| 55 step 4_1 --> XX.paired.bam [pair end1 and end1] | |
| 56 step 4_2 --> XX.paired.sorted.bam [sort paired bam] | |
| 57 step 5 --> XX.withRG.paired.sorted.bam [add read group] | |
| 58 step 6_1 --> XX.orig.csv [prepair recalibration table] | |
| 59 step 6_2 --> XX.withRG.GATKRecalibrated.bam [recalibration] | |
| 60 step 7 --> XX.withRG.GATKRecalibrated.flagged.bam [flag duplication reads] | |
| 61 step 8 --> folder XX for gene expression, QC metrics etc. [generate QC and expression] | |
| 62 ''' | |
| 63 | |
| 64 if '-h' in args or '-help' in args or len(args)==1: | |
| 65 print help_menu | |
| 66 sys.exit(0) | |
| 67 if '-step_info' in args: | |
| 68 print steps_info | |
| 69 sys.exit(0) | |
| 70 if '-v' in args: | |
| 71 import version | |
| 72 print version.version | |
| 73 sys.exit(0) | |
| 74 | |
| 75 if '-sample' not in args: | |
| 76 sys.exit('ERROR: Sample name is needed') | |
| 77 if '-step' not in args: | |
| 78 sys.exit('ERROR: Step number is needed') | |
| 79 if '-tag' not in args: | |
| 80 sys.exit('ERROR: A tag is needed') | |
| 81 | |
| 82 i=args.index('-sample') | |
| 83 sample=args[i+1] | |
| 84 if '-inputdir' not in args: | |
| 85 inputpath=os.path.abspath('./') | |
| 86 else: | |
| 87 i=args.index('-inputdir') | |
| 88 inputpath=args[i+1] | |
| 89 bampath=os.path.abspath(inputpath)+'/%s.bam'%sample | |
| 90 | |
| 91 if '-outdir' not in args: | |
| 92 outpath=os.path.dirname(bampath) | |
| 93 else: | |
| 94 i=args.index('-outdir') | |
| 95 outpath=os.path.abspath(args[i+1]) | |
| 96 if not os.path.exists(outpath): | |
| 97 os.mkdir(outpath) | |
| 98 | |
| 99 #bam=os.path.basename(bampath) | |
| 100 #sample=bam[:-4] | |
| 101 i=args.index('-step') | |
| 102 step=args[i+1] | |
| 103 i=args.index('-tag') | |
| 104 tag=args[i+1] | |
| 105 | |
| 106 prada_path=os.path.dirname(os.path.abspath(__file__)) #### | |
| 107 ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line | |
| 108 | |
| 109 if '-conf' in args: | |
| 110 i=args.index('-conf') | |
| 111 reffile=args[i+1] | |
| 112 if os.path.exists(reffile): | |
| 113 pass | |
| 114 else: | |
| 115 for pth in ref_search_path: | |
| 116 new_reffile='%s/%s'%(pth, os.path.basename(reffile)) | |
| 117 if os.path.exists(new_reffile): | |
| 118 reffile=new_reffile | |
| 119 break | |
| 120 else: | |
| 121 sys.exit('ERROR: conf file %s not found'%reffile) | |
| 122 else: | |
| 123 reffile='%s/conf.txt'%prada_path | |
| 124 if not os.path.exists(reffile): | |
| 125 sys.exit('ERROR: No default conf.txt found and none specified') | |
| 126 | |
| 127 if '-platform' in args: | |
| 128 i=args.index('-platform') | |
| 129 plat=args[i+1] | |
| 130 else: | |
| 131 plat='illumina' | |
| 132 if '-submit' in args: | |
| 133 i=args.index('-submit') | |
| 134 submit=args[i+1] | |
| 135 else: | |
| 136 submit='no' | |
| 137 if '-intermediate' in args: | |
| 138 i=args.index('-intermediate') | |
| 139 keepmed=args[i+1] | |
| 140 else: | |
| 141 keepmed='False' | |
| 142 | |
| 143 if keepmed in ['False','FALSE','false','F','NO','No','no','n']: | |
| 144 keepmed_flag='no' | |
| 145 elif keepmed in ['True','TRUE','true','T','YES','Yes','yes','y']: | |
| 146 keepmed_flag='yes' | |
| 147 else: | |
| 148 sys.exit('ERROR: -intermediate value not recognized') | |
| 149 | |
| 150 if '-pbs' in args: | |
| 151 i=args.index('-pbs') | |
| 152 docstr=args[i+1] | |
| 153 else: | |
| 154 a=time.ctime().split() | |
| 155 b=time.time() | |
| 156 timestamp='_'.join([a[-1],a[1],a[2]])+'.'+str(b) | |
| 157 docstr='prada_prep_'+timestamp | |
| 158 logfilename=docstr+'.log' | |
| 159 pbsfilename=docstr+'.pbs' | |
| 160 pbspath=outpath+'/'+pbsfilename | |
| 161 logpath=outpath+'/'+logfilename | |
| 162 ######################################################################################## | |
| 163 | |
| 164 ######################################################################################## | |
| 165 #underlying utilities, automatically detected | |
| 166 samtools='%s/tools/samtools-0.1.16/samtools'%prada_path | |
| 167 bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path | |
| 168 gatk='%s/tools/GATK/'%prada_path | |
| 169 picard='%s/tools/Picard/'%prada_path | |
| 170 seqc='%s/tools/RNA-SeQC_v1.1.7.jar'%prada_path | |
| 171 sam2fastq='%s/tools/ubu-1.2-jar-with-dependencies.jar'%prada_path | |
| 172 #Default uses 12 nodes in HPC | |
| 173 ######################################################################################### | |
| 174 | |
| 175 ######################################################################################### | |
| 176 #reference files | |
| 177 refdict=ioprada.read_conf(reffile) | |
| 178 genome_gtf=refdict['--REF--']['genome_gtf'] | |
| 179 compdb_fasta=refdict['--REF--']['compdb_fasta'] | |
| 180 compdb_map=refdict['--REF--']['compdb_map'] | |
| 181 genome_fasta=refdict['--REF--']['genome_fasta'] | |
| 182 dbsnp_vcf=refdict['--REF--']['dbsnp_vcf'] | |
| 183 select_tx=refdict['--REF--']['select_tx'] | |
| 184 pat=re.compile('ppn=(\d*)') | |
| 185 parallel_n=pat.search(refdict['--PBS--']['-l']).groups()[0] | |
| 186 ######################################################################################### | |
| 187 | |
| 188 ######################################################################################### | |
| 189 #pipeline command lines. | |
| 190 ##Cleaning up steps: if -intermediate is yes, none will be executed. | |
| 191 step_1_1_cmd=['%s sort -n -m 1000000000 %s %s.sorted'%(samtools,bampath,sample)] | |
| 192 post_1_1_clean=[] | |
| 193 step_1_2_cmd=['java -Xmx8g -jar %s INPUT=%s.sorted.bam FASTQ=%s.end1.fastq SECOND_END_FASTQ=%s.end2.fastq INCLUDE_NON_PF_READS=true VALIDATION_STRINGENCY=SILENT TMP_DIR=tmp/'%(sam2fastq,sample,sample,sample)] | |
| 194 post_1_2_clean=['rm -f %s.sorted.bam'%sample] | |
| 195 alnparams=' '.join([' '.join(x) for x in refdict['--BWA aln--'].items()]) | |
| 196 samseparams=' '.join([' '.join(x) for x in refdict['--BWA samse--'].items()]) | |
| 197 if step == '2_e1_1' or step == '2_e2_1': | |
| 198 step_2_e1_1_cmd=['%s aln %s %s %s > %s.end1.sai'%(bwa,alnparams,compdb_fasta,fq1path,sample)] | |
| 199 post_2_e1_1_clean=[] | |
| 200 step_2_e1_2_cmd=['%s samse -s %s %s %s.end1.sai %s > %s.end1.sam'%(bwa,samseparams,compdb_fasta,sample,fq1path,sample)] | |
| 201 post_2_e1_2_clean=['rm -f %s.end1.sai'%sample] | |
| 202 step_2_e2_1_cmd=['%s aln %s %s %s > %s.end2.sai'%(bwa,alnparams,compdb_fasta,fq2path,sample)] | |
| 203 post_2_e2_1_clean=[] | |
| 204 step_2_e2_2_cmd=['%s samse -s %s %s %s.end2.sai %s > %s.end2.sam'%(bwa,samseparams,compdb_fasta,sample,fq2path,sample)] | |
| 205 post_2_e2_2_clean=['rm -f %s.end2.sai'%sample] | |
| 206 else: | |
| 207 step_2_e1_1_cmd=['%s aln %s %s %s.end1.fastq > %s.end1.sai'%(bwa,alnparams,compdb_fasta,sample,sample)] | |
| 208 post_2_e1_1_clean=[] | |
| 209 step_2_e1_2_cmd=['%s samse -s %s %s %s.end1.sai %s.end1.fastq > %s.end1.sam'%(bwa,samseparams,compdb_fasta,sample,sample,sample)] | |
| 210 post_2_e1_2_clean=['rm -f %s.end1.sai'%sample,'rm -f %s.end1.fastq'%sample] | |
| 211 step_2_e2_1_cmd=['%s aln %s %s %s.end2.fastq > %s.end2.sai'%(bwa,alnparams,compdb_fasta,sample,sample)] | |
| 212 post_2_e2_1_clean=[] | |
| 213 step_2_e2_2_cmd=['%s samse -s %s %s %s.end2.sai %s.end2.fastq > %s.end2.sam'%(bwa,samseparams,compdb_fasta,sample,sample,sample)] | |
| 214 post_2_e2_2_clean=['rm -f %s.end2.sai'%sample,'rm -f %s.end2.fastq'%sample] | |
| 215 step_2_e1_3_cmd=['%s view -bS -o %s.end1.bam %s.end1.sam'%(samtools,sample,sample)] | |
| 216 post_2_e1_3_clean=['rm -f %s.end1.sam'%sample] | |
| 217 step_2_e1_4_cmd=['%s sort -n -m 1000000000 %s.end1.bam %s.end1.sorted'%(samtools,sample,sample)] | |
| 218 post_2_e1_4_clean=['rm -f %s.end1.bam'%sample] | |
| 219 step_2_e2_3_cmd=['%s view -bS -o %s.end2.bam %s.end2.sam'%(samtools,sample,sample)] | |
| 220 post_2_e2_3_clean=['rm -f %s.end2.sam'%sample] | |
| 221 step_2_e2_4_cmd=['%s sort -n -m 1000000000 %s.end2.bam %s.end2.sorted'%(samtools,sample,sample)] | |
| 222 post_2_e2_4_clean=['rm -f %s.end2.bam'%sample] | |
| 223 step_3_e1_1_cmd=['java -Djava.io.tmpdir=tmp/ -cp %s/RemapAlignments.jar -Xmx8g org.broadinstitute.cga.tools.gatk.rna.RemapAlignments M=%s IN=%s.end1.sorted.bam OUT=%s.end1.remapped.bam R=%s REDUCE=TRUE'%(gatk,compdb_map,sample,sample,genome_fasta)] | |
| 224 post_3_e1_1_clean=['rm -f %s.end1.sorted.bam'%sample] | |
| 225 step_3_e1_2_cmd=['%s sort -n -m 1000000000 %s.end1.remapped.bam %s.end1.remapped.sorted'%(samtools,sample,sample)] | |
| 226 post_3_e1_2_clean=['rm -f %s.end1.remapped.bam'%sample] | |
| 227 step_3_e2_1_cmd=['java -Djava.io.tmpdir=tmp/ -cp %s/RemapAlignments.jar -Xmx8g org.broadinstitute.cga.tools.gatk.rna.RemapAlignments M=%s IN=%s.end2.sorted.bam OUT=%s.end2.remapped.bam R=%s REDUCE=TRUE'%(gatk,compdb_map,sample,sample,genome_fasta)] | |
| 228 post_3_e2_1_clean=['rm -f %s.end2.sorted.bam'%sample] | |
| 229 step_3_e2_2_cmd=['%s sort -n -m 1000000000 %s.end2.remapped.bam %s.end2.remapped.sorted'%(samtools,sample,sample)] | |
| 230 post_3_e2_2_clean=['rm -f %s.end2.remapped.bam'%sample] | |
| 231 step_4_1_cmd=['java -Djava.io.tmpdir=tmp/ -Xmx8g -jar %s/PairMaker.jar IN1=%s.end1.remapped.sorted.bam IN2=%s.end2.remapped.sorted.bam OUTPUT=%s.paired.bam TMP_DIR=tmp/'%(gatk,sample,sample,sample)] | |
| 232 post_4_1_clean=['rm -f %s.end1.remapped.sorted.bam'%sample,'rm -f %s.end2.remapped.sorted.bam'%sample] | |
| 233 step_4_2_cmd=['%s sort -m 1000000000 %s.paired.bam %s.paired.sorted'%(samtools,sample,sample)] | |
| 234 post_4_2_clean=['rm -f %s.paired.bam'%sample] | |
| 235 step_5_cmd=['java -Xmx8g -jar %s/AddOrReplaceReadGroups.jar I=%s.paired.sorted.bam O=%s.withRG.paired.sorted.bam RGLB=%s RGPL=%s RGPU=%s RGSM=%s'%(picard,sample,sample,tag,plat,tag,tag)] | |
| 236 post_5_clean=['rm -f %s.paired.sorted.bam'%sample] | |
| 237 step_6_1_cmd=['%s index %s.withRG.paired.sorted.bam'%(samtools,sample),'java -Xmx8g -jar %s/GenomeAnalysisTK.jar -l INFO -R %s --default_platform %s --knownSites %s -I %s.withRG.paired.sorted.bam --downsample_to_coverage 10000 -T CountCovariates -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov DinucCovariate -nt %s -recalFile %s.orig.csv'%(gatk,genome_fasta,plat,dbsnp_vcf,sample,parallel_n,sample)] | |
| 238 post_6_1_clean=[] | |
| 239 step_6_2_cmd=['java -Xmx8g -jar %s/GenomeAnalysisTK.jar -l INFO -R %s --default_platform %s -I %s.withRG.paired.sorted.bam -T TableRecalibration --out %s.withRG.GATKRecalibrated.bam -recalFile %s.orig.csv'%(gatk,genome_fasta,plat,sample,sample,sample)] | |
| 240 post_6_2_clean=['rm -f %s.withRG.paired.sorted.bam'%sample,'rm -f %s.withRG.paired.sorted.bam.bai'%sample,'rm -f %s.orig.csv'%sample] | |
| 241 step_7_cmd=['java -Xmx8g -jar %s/MarkDuplicates.jar I=%s.withRG.GATKRecalibrated.bam O=%s.withRG.GATKRecalibrated.flagged.bam METRICS_FILE=%s.Duplicates_metrics.txt VALIDATION_STRINGENCY=SILENT TMP_DIR=tmp/'%(picard,sample,sample,sample),'%s index %s.withRG.GATKRecalibrated.flagged.bam'%(samtools,sample)] | |
| 242 post_7_clean=['rm -f %s.withRG.GATKRecalibrated.bam'%sample,'rm -f %s.withRG.GATKRecalibrated.bai'%sample,'rm -f %s.Duplicates_metrics.txt'%sample] | |
| 243 step_8_cmd=["java -Xmx8g -jar %s -ttype 2 -t %s -r %s -s '%s|%s.withRG.GATKRecalibrated.flagged.bam|Disc' -o %s/"%(seqc,genome_gtf,genome_fasta,sample,sample,sample)] | |
| 244 post_8_clean=[] | |
| 245 | |
| 246 cmdset=[] | |
| 247 for item in globals().keys(): | |
| 248 if item.endswith('_cmd'): | |
| 249 cmdset.append(item) | |
| 250 cmdset.sort() | |
| 251 | |
| 252 ######################################################################################### | |
| 253 #write PBS file | |
| 254 | |
| 255 ###############Entry Point | |
| 256 try: | |
| 257 cmd_entry=cmdset.index('step_'+step+'_cmd') ##entry point | |
| 258 except ValueError: | |
| 259 sys.exit('ERROR: STEP not recognized') | |
| 260 | |
| 261 def _parsecmd(cmd): | |
| 262 '''pase cmd str for step information''' | |
| 263 info=cmd.split('_') | |
| 264 cstep='_'.join(info[1:-1]) | |
| 265 return cstep | |
| 266 | |
| 267 ########################### | |
| 268 ##headers | |
| 269 outfile=open(pbspath,'w') | |
| 270 outfile.write('#! /bin/sh\n') | |
| 271 outfile.write('#PBS -V\n') | |
| 272 outfile.write('#PBS -N %s\n'%tag) | |
| 273 outfile.write('#PBS -j oe\n') | |
| 274 outfile.write('#PBS -o %s\n'%logpath) | |
| 275 for item in refdict['--PBS--']: | |
| 276 outfile.write('#PBS %s %s\n'%(item,refdict['--PBS--'][item])) | |
| 277 outfile.write('#PBS -d %s\n'%outpath) | |
| 278 | |
| 279 #commands | |
| 280 outfile.write('echo "Job start: `date`"\n') | |
| 281 | |
| 282 for i in range(cmd_entry,len(cmdset)): | |
| 283 cmd=cmdset[i] | |
| 284 cstep=_parsecmd(cmd) | |
| 285 clean_flag=1 | |
| 286 outfile.write('echo "step %s start: `date`"\n'%cstep) | |
| 287 outfile.write('if\n') | |
| 288 outfile.write('\t%s\n'%('\n'.join(eval(cmd)))) | |
| 289 outfile.write('then\n') | |
| 290 outfile.write('\techo "step %s done: `date`"\n'%cstep) | |
| 291 outfile.write('else\n') | |
| 292 outfile.write('\techo "step %s ERROR"\n'%cstep) | |
| 293 outfile.write('\texit\n') | |
| 294 outfile.write('fi\n') | |
| 295 | |
| 296 if keepmed_flag=='yes': #if so, keep files by force | |
| 297 clean_flag=0 | |
| 298 | |
| 299 if clean_flag==1: | |
| 300 outfile.write('\n'.join(eval('post_%s_clean'%cstep))) | |
| 301 outfile.write('\n') | |
| 302 | |
| 303 outfile.write('echo "PIPELINE FINISHED"\n') | |
| 304 outfile.close() | |
| 305 ###################################################################### | |
| 306 | |
| 307 if submit in ['False','FALSE','false','F','NO','No','no','n']: | |
| 308 jid='Not_Submitted' | |
| 309 logpath='None' | |
| 310 elif submit in ['True','TRUE','true','T','YES','Yes','yes','y']: | |
| 311 cmdstr='qsub %s'%pbspath | |
| 312 cmd=cmdstr.split() | |
| 313 cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT) | |
| 314 jid=cmdout.stdout.read().strip() ##JOB ID | |
| 315 else: | |
| 316 sys.exit('ERROR: submit parameter not recognized') | |
| 317 | |
| 318 print '#!#%s'%tag | |
| 319 print 'BAM\t%s'%bampath | |
| 320 print 'Entry\t%s'%step | |
| 321 print 'PBS\t%s'%pbspath | |
| 322 print 'JOB\t%s'%jid | |
| 323 print 'LOG\t%s'%logpath | |
| 324 |
