Mercurial > repos > yufei-luo > s_mart
comparison SMART/DiffExpAnal/tophat_parallel_unSQL.py @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki |
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 |
|---|---|
| 1 | |
| 2 #!/usr/bin/env python | |
| 3 | |
| 4 import optparse, os, shutil, subprocess, sys, tempfile, fileinput, tarfile, glob | |
| 5 from commons.core.launcher.Launcher import Launcher | |
| 6 from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory | |
| 7 from commons.core.utils.FileUtils import FileUtils | |
| 8 | |
def stop_err( msg ):
    """Report a fatal error message on stderr, then terminate the script."""
    text = "%s\n" % msg
    sys.stderr.write(text)
    sys.exit()
| 12 | |
def toTar(tarFileName, accepted_hits_outputNames):
    """Bundle all accepted-hits BAM files into a single tar archive.

    The archive is first written to ``tarFileName + ".tmp.tar"`` and only
    renamed to ``tarFileName`` once it is complete, so a partially written
    archive is never left under the final name.

    tarFileName              -- final path of the tar archive
    accepted_hits_outputNames -- paths of the BAM files to include; each is
                                 stored under its basename only
    """
    tmpName = tarFileName + ".tmp.tar"
    tfile = tarfile.open(tmpName, "w")
    try:
        for fileName in accepted_hits_outputNames:
            # arcname keeps only the basename, as the original chdir-based
            # version intended (it used an undefined name and renamed the
            # archive before closing it).
            tfile.add(fileName, arcname=os.path.basename(fileName))
    finally:
        # Close (flush) BEFORE renaming, so the final file is complete.
        tfile.close()
    shutil.move(tmpName, tarFileName)
| 23 | |
def splitFastQ(fileName, nbOfSeqPerBatch):
    """Split a FASTQ file into batches of ``nbOfSeqPerBatch`` records.

    A FASTQ record is 4 lines. Batch files are written next to the input as
    ``<prefix>-<n><ext>`` and their paths are returned in order. If the input
    is empty or ``nbOfSeqPerBatch`` is 0, a single verbatim copy is produced.

    fileName        -- path of the FASTQ file to split
    nbOfSeqPerBatch -- number of reads per batch file (0 = no splitting)
    Returns the list of batch file paths.
    """
    nbOfLinesPerFile = nbOfSeqPerBatch * 4
    lOutput = []
    filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
    resDir = os.path.dirname(fileName)
    with open(fileName) as inF:
        fileNb = 1
        line = inF.readline()
        if not line or nbOfLinesPerFile == 0:
            # os.path.join avoids producing "/prefix-1.ext" (filesystem root)
            # when fileName has no directory component.
            outFileName = os.path.join(resDir, "%s-%s%s" % (filePrefix, fileNb, fileExt))
            lOutput.append(outFileName)
            # copyfile closes both handles (the original leaked the read handle).
            shutil.copyfile(fileName, outFileName)
        else:
            while line:
                outFileName = os.path.join(resDir, "%s-%s%s" % (filePrefix, fileNb, fileExt))
                lOutput.append(outFileName)
                with open(outFileName, "w") as outF:
                    lineNb = 1
                    while lineNb <= nbOfLinesPerFile and line:
                        outF.write(line)
                        line = inF.readline()
                        lineNb += 1
                fileNb += 1
    return lOutput
| 50 | |
def joinBAM(dCutOut2Out):
    """Merge the per-batch BAM files back into one BAM per sample.

    dCutOut2Out -- dict mapping each final BAM path (key) to the list of
                   per-batch BAM files produced for that sample.

    Runs ``samtools merge -f`` through the shell for every key. When more
    than 224 batch files accumulate, the command is chained: the partial
    merge result is moved to ``tmpBAM`` and merged again together with the
    next file — presumably to stay under a command-line length / argument
    limit (NOTE(review): the 225 threshold is unexplained; confirm).
    """
    for key in dCutOut2Out.keys():
        # Truncate/create the destination file up front.
        # NOTE(review): likely redundant, since "samtools merge -f" overwrites.
        fh = open(key, "w")
        fh.close()
        nbFile = 0
        cmd = "samtools merge -f %s" % key
        for fileName in dCutOut2Out[key]:
            nbFile = nbFile + 1
            if nbFile < 225:
                # Normal case: append this batch file to the merge command.
                cmd += " %s" % fileName
            else:
                # Too many files: stash the current result in tmpBAM and
                # start a chained merge that includes it plus this file.
                nbFile = 0
                cmd += ";mv %s tmpBAM;" % (key)
                cmd += "samtools merge -f %s tmpBAM %s" % (key, fileName)
        # shell=True is required for the ";"-chained commands; the return
        # code is not checked (best-effort, as in the rest of the script).
        proc = subprocess.Popen( args=cmd , shell=True)
        returncode = proc.wait()
| 67 | |
| 68 | |
| 69 def _map(iLauncher, cmd, cmdStart, cmdFinish ): | |
| 70 lCmds = [] | |
| 71 lCmds.extend(cmd) | |
| 72 lCmdStart = [] | |
| 73 lCmdStart.extend(cmdStart) | |
| 74 lCmdFinish = [] | |
| 75 lCmdFinish.extend(cmdFinish) | |
| 76 return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) | |
| 77 | |
| 78 def _createTopHatCommand(iLauncher, options, index_paths, inputFileNames, inputRevFilesNames, space): | |
| 79 lArgs = [] | |
| 80 lArgs.append('-p %s %s' % ( options.num_threads, space )) | |
| 81 if options.single_paired == 'paired': | |
| 82 lArgs.append('-r %s ' % options.mate_inner_dist) | |
| 83 if options.settings == 'preSet': | |
| 84 lArgs.append(index_paths) | |
| 85 lArgs.append(inputFileNames) | |
| 86 if options.input2: | |
| 87 lArgs.append(inputRevFilesNames) | |
| 88 return iLauncher.getSystemCommand("tophat", lArgs) | |
| 89 else: | |
| 90 if int( options.min_anchor_length ) >= 3: | |
| 91 lArgs.append('-a %s ' % options.min_anchor_length) | |
| 92 else: | |
| 93 raise Exception, 'Minimum anchor length must be 3 or greater' | |
| 94 lArgs.append('-m %s ' % options.splice_mismatches) | |
| 95 lArgs.append('-i %s ' % options.min_intron_length) | |
| 96 lArgs.append('-I %s ' % options.max_intron_length) | |
| 97 if float( options.junction_filter ) != 0.0: | |
| 98 lArgs.append('-F %s ' % options.junction_filter) | |
| 99 lArgs.append('-g %s ' % options.max_multihits) | |
| 100 # Custom junctions options. | |
| 101 if options.gene_model_annotations: | |
| 102 lArgs.append('-G %s ' % options.gene_model_annotations) | |
| 103 if options.raw_juncs: | |
| 104 lArgs.append('-j %s ' % options.raw_juncs) | |
| 105 if options.no_novel_juncs: | |
| 106 lArgs.append('--no-novel-juncs ') | |
| 107 if options.library_type: | |
| 108 lArgs.append('--library-type %s ' % options.library_type) | |
| 109 if options.no_novel_indels: | |
| 110 lArgs.append('--no-novel-indels ') | |
| 111 else: | |
| 112 if options.max_insertion_length: | |
| 113 lArgs.append('--max-insertion-length %i ' % int( options.max_insertion_length )) | |
| 114 if options.max_deletion_length: | |
| 115 lArgs.append('--max-deletion-length %i ' % int( options.max_deletion_length )) | |
| 116 # Max options do not work for Tophat v1.2.0, despite documentation to the contrary. (Fixed in version 1.3.1) | |
| 117 # need to warn user of this fact | |
| 118 #sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" ) | |
| 119 | |
| 120 # Search type options. | |
| 121 if options.coverage_search: | |
| 122 lArgs.append('--coverage-search --min-coverage-intron %s --max-coverage-intron %s ' % ( options.min_coverage_intron, options.max_coverage_intron )) | |
| 123 else: | |
| 124 lArgs.append('--no-coverage-search ') | |
| 125 if options.closure_search: | |
| 126 lArgs.append('--closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s ' % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron )) | |
| 127 else: | |
| 128 lArgs.append('--no-closure-search ') | |
| 129 if options.microexon_search: | |
| 130 lArgs.append('--microexon-search ') | |
| 131 if options.single_paired == 'paired': | |
| 132 lArgs.append('--mate-std-dev %s ' % options.mate_std_dev) | |
| 133 if options.initial_read_mismatches: | |
| 134 lArgs.append('--initial-read-mismatches %d ' % int( options.initial_read_mismatches )) | |
| 135 if options.seg_mismatches: | |
| 136 lArgs.append('--segment-mismatches %d ' % int( options.seg_mismatches )) | |
| 137 if options.seg_length: | |
| 138 lArgs.append('--segment-length %d ' % int( options.seg_length )) | |
| 139 if options.min_segment_intron: | |
| 140 lArgs.append('--min-segment-intron %d ' % int( options.min_segment_intron )) | |
| 141 if options.max_segment_intron: | |
| 142 lArgs.append('--max-segment-intron %d ' % int( options.max_segment_intron )) | |
| 143 lArgs.append(index_paths) | |
| 144 lArgs.append(inputFileNames) | |
| 145 if options.input2: | |
| 146 lArgs.append(inputRevFilesNames) | |
| 147 return iLauncher.getSystemCommand("tophat", lArgs) | |
| 148 | |
| 149 | |
| 150 | |
def __main__():
    """Command-line driver: run TopHat in parallel over a list of FASTQ files.

    Workflow:
      1. parse the options (mostly forwarded verbatim to tophat),
      2. read the tab-separated input list (label<TAB>fastq_path) and write
         the output manifest (label<TAB>bam_path) to --outputTxtFile,
      3. optionally build a bowtie index from --own-file through the Launcher,
      4. split each FASTQ into 20000-read batches, submit one tophat job per
         batch, then merge the per-batch BAMs with joinBAM(),
      5. optionally tar all merged BAMs and clean up temporary files.
    """
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-o', '--outputTxtFile', dest='outputTxtFile', help='for Differential expression analysis pipeline, new output option gives a txt output containing the list of mapping results.')
    parser.add_option('-t', '--tar', dest='outputTar', default=None, help='output all accepted hits results in a tar file.' )
    parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
    parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' )
    parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', default='junctions_output.bed', help='Junctions output file; formate is BED.' )
    parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', default='hits_output.bam', help='Accepted hits output file; formate is BAM.' )
    parser.add_option( '', '--own-file', dest='own_file', help='' )
    parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
    parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \
                        For, example, for paired end runs with fragments selected at 300bp, \
                        where each end is 50bp, you should set -r to be 200. There is no default, \
                        and this parameter is required for paired end runs.')
    parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between male pairs.' )
    parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length',
                        help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' )
    parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' )
    parser.add_option( '-i', '--min-intron-length', dest='min_intron_length',
                        help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' )
    parser.add_option( '-I', '--max-intron-length', dest='max_intron_length',
                        help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' )
    parser.add_option( '-F', '--junction_filter', dest='junction_filter', help='Filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)' )
    parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' )
    parser.add_option( '', '--initial-read-mismatches', dest='initial_read_mismatches', help='Number of mismatches allowed in the initial read mapping' )
    parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' )
    parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' )
    parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' )
    parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.(Not used since version 1.3.0)' )
    parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' )
    parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' )

    # Options for supplying own junctions
    parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \
                        TopHat will use the exon records in this file to build \
                        a set of known splice junctions for each gene, and will \
                        attempt to align reads to these junctions even if they \
                        would not normally be covered by the initial mapping.')
    parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \
                        specified one per line, in a tab-delimited format. Records \
                        look like: <chrom> <left> <right> <+/-> left and right are \
                        zero-based coordinates, and specify the last character of the \
                        left sequenced to be spliced to the first character of the right \
                        sequence, inclusive.')
    parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \
                        supplied GFF file. (ignored without -G)")
    parser.add_option( '', '--no-novel-indels', action="store_true", dest='no_novel_indels', help="Skip indel search. Indel search is enabled by default.")
    # Types of search.
    parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.')
    parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)')
    parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' )
    parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.')
    parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' )
    parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' )
    parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' )
    parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' )
    parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' )
    parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' )
    parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' )
    parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' )

    # Wrapper options.
    parser.add_option( '-1', '--input1', dest='input1', help='A list of the (forward or single-end) reads files of Sanger FASTQ format, txt format' )
    parser.add_option( '-2', '--input2', dest='input2', help='The list of reverse reads file in Sanger FASTQ format' )
    parser.add_option( '', '--single-paired', dest='single_paired', help='' )
    parser.add_option( '', '--settings', dest='settings', help='' )

    (options, args) = parser.parse_args()

    # output version # of tool
    # NOTE(review): if NamedTemporaryFile() itself raises, tmp_files is never
    # bound and the cleanup loop at the bottom raises NameError.
    # NamedTemporaryFile().name is also a create-delete-reopen race.
    try:
        tmp_files = []
        tmp = tempfile.NamedTemporaryFile().name
        tmp_files.append(tmp)
        tmp_stdout = open( tmp, 'wb' )
        proc = subprocess.Popen( args='tophat -v', shell=True, stdout=tmp_stdout )
        tmp_stdout.close()
        returncode = proc.wait()
        stdout = open( tmp_stdout.name, 'rb' ).readline().strip()
        if stdout:
            sys.stdout.write( '%s\n' % stdout )
        else:
            raise Exception
    except:
        sys.stdout.write( 'Could not determine Tophat version\n' )

    # Color or base space: '-C' is forwarded to bowtie-build/tophat.
    space = ''
    if options.color_space:
        space = '-C'


    #reads = options.input1
    # Read the sample list (one "label<TAB>fastq_path" per line) and write
    # the output manifest "label<TAB>bam_path" expected by downstream steps.
    file = open(options.input1,"r")
    lines = file.readlines()
    inputFileNames = []
    accepted_hits_outputNames = []
    outputName = options.outputTxtFile
    resDirName = os.path.dirname(outputName) + '/'
    out = open(outputName, "w")
    for line in lines:
        tab = line.split()
        inputFileNames.append(tab[1])
        aHitOutName = resDirName + tab[0] + '_' + options.accepted_hits_output_file
        accepted_hits_outputNames.append(aHitOutName)
        out.write(tab[0] + '\t' + aHitOutName + '\n')
    file.close()
    out.close()

    # Same format for the reverse-reads list (paired-end runs only).
    if options.input2:
        revFile = open(options.input2,"r")
        lines = revFile.readlines()
        inputRevFileNames = []
        for line in lines:
            revTab = line.split()
            inputRevFileNames.append(revTab[1])
        revFile.close()


    # Create bowtie index if necessary.
    tmp_index_dirs = []
    index_paths = []
    tmp_index_dir = tempfile.mkdtemp(dir="%s" % os.getcwd())
    tmp_index_dirs.append(tmp_index_dir)
    if options.own_file:
        # Build one index from the user-supplied FASTA, via a cluster job.
        index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.own_file )[1].split( '.' )[:-1] ) )
        index_paths.append(index_path)
        try:
            os.link( options.own_file, index_path + '.fa' )
        except:
            # Tophat prefers (but doesn't require) fasta file to be in same directory, with .fa extension
            pass
        lCmdsTuples =[]
        acronym = "tophat_index"
        jobdb = TableJobAdaptatorFactory.createJobInstance()
        iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
        cmd_index = iLauncher.getSystemCommand("bowtie-build", [space, "-f %s" % options.own_file, index_path])
        cmd2Launch = []
        cmdStart = []
        cmdFinish = []
        cmd2Launch.append(cmd_index)
        lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))
        iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)
    else:
        # No FASTA given: derive one index path per input file.
        # NOTE(review): no index is actually built here — presumably the
        # pre-built index is expected at these paths; confirm against caller.
        for file in inputFileNames:
            tmp_index_dir = tempfile.mkdtemp()
            index_path = tmp_index_dir + '/' + os.path.basename(file).split('.')[0]
            index_paths.append(index_path)
            tmp_index_dirs.append(tmp_index_dir)



    # Submit one tophat job per 20000-read batch of each input file.
    acronym = "tophat"
    jobdb = TableJobAdaptatorFactory.createJobInstance()
    iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
    lCmdsTuples = []
    dCutOut2Out = {}
    lAllFile2remove = []
    # for inputFileName in inputFileNames:
    for i in range(len(inputFileNames)):
        lCutOutput = []
        lCutInputFile = splitFastQ(inputFileNames[i], 20000)
        lAllFile2remove.extend(lCutInputFile)
        if options.input2:
            lCutPairInputFile = splitFastQ(inputRevFileNames[i], 20000)
            lAllFile2remove.extend(lCutPairInputFile)
        for j in range(len(lCutInputFile)):
            cutOutput = "%s_out" % lCutInputFile[j]
            lCutOutput.append(cutOutput)
            # NOTE(review): extending with the whole lCutOutput list inside
            # the loop re-adds earlier entries; harmless because removal is
            # idempotent, but extend([cutOutput]) was probably intended.
            lAllFile2remove.extend(lCutOutput)
            cmd2Launch = []
            if options.input2:
                inputRevFile = lCutPairInputFile[j]
            else:
                inputRevFile = ""
            # With --own-file there is a single shared index; otherwise one
            # index path per input file.
            if options.own_file:
                cmd2Launch.append(_createTopHatCommand(iLauncher, options, index_paths[0], lCutInputFile[j], inputRevFile, space))
            else:
                cmd2Launch.append(_createTopHatCommand(iLauncher, options, index_paths[i], lCutInputFile[j], inputRevFile, space))
            cmdStart = []
            # Post-command run on the node: copy tophat's BAM to the batch output.
            cmdFinish = ["shutil.copyfile( os.path.join( 'tophat_out', 'accepted_hits.bam' ), '%s')" % cutOutput]
            lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))
        dCutOut2Out[accepted_hits_outputNames[i]] = lCutOutput
    iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)

    # Merge per-batch BAMs into the final per-sample BAMs, then tidy up.
    joinBAM(dCutOut2Out)
    FileUtils.removeFilesFromListIfExist(lAllFile2remove)

    if options.outputTar != None:
        toTar(options.outputTar, accepted_hits_outputNames)


    # Clean up temp dirs
    for tmp_index_dir in tmp_index_dirs:
        if os.path.exists( tmp_index_dir ):
            shutil.rmtree( tmp_index_dir )

    for tmp in tmp_files:
        os.remove(tmp)


if __name__=="__main__": __main__()
