annotate bwa_mem.py @ 0:6820983ba5d5 draft

Uploaded
author crs4
date Tue, 18 Mar 2014 07:49:22 -0400
parents
children ebb02ba5987c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
1 # -*- coding: utf-8 -*-
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
2 #!/usr/bin/env python
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
3 ## yufei.luo@gustave.roussy 22/07/2013
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
4 ## Copyright © 2014 CRS4 Srl. http://www.crs4.it/
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
5 ## Modified by:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
6 ## Nicola Soranzo <nicola.soranzo@crs4.it>
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
7
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
8 """
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
9 Runs BWA on single-end or paired-end data.
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
10 Produces a SAM file containing the mappings.
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
11 Works with BWA version >= 0.7.5.
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
12 NOTICE: In this wrapper, we only use 'mem' for mapping step.
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
13
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
14 usage: bwa_mem.py [options]
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
15
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
16 See below for options
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
17 """
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
18
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
19 import optparse, os, shutil, subprocess, sys, tempfile
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
20
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
21 def stop_err( msg ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
22 sys.stderr.write( '%s\n' % msg )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
23 sys.exit()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
24
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
25 def check_is_double_encoded( fastq ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
26 # check that first read is bases, not one base followed by numbers
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
27 bases = [ 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N' ]
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
28 nums = [ '0', '1', '2', '3' ]
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
29 for line in file( fastq, 'rb'):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
30 if not line.strip() or line.startswith( '@' ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
31 continue
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
32 if len( [ b for b in line.strip() if b in nums ] ) > 0:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
33 return False
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
34 elif line.strip()[0] in bases and len( [ b for b in line.strip() if b in bases ] ) == len( line.strip() ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
35 return True
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
36 else:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
37 raise Exception, 'First line in first read does not appear to be a valid FASTQ read in either base-space or color-space'
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
38 raise Exception, 'There is no non-comment and non-blank line in your FASTQ file'
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
39
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
40 def __main__():
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
41 descr = "bwa_mem.py: Map (long length) reads against a reference genome with BWA-MEM."
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
42 parser = optparse.OptionParser(description=descr)
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
43 parser.add_option( '-t', '--threads', default=1, help='The number of threads to use [1]' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
44 parser.add_option( '--ref', help='The reference genome to use or index' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
45 parser.add_option( '-f', '--fastq', help='The (forward) fastq file to use for the mapping' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
46 parser.add_option( '-F', '--rfastq', help='The reverse fastq file to use for mapping if paired-end data' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
47 parser.add_option( '-u', '--output', help='The file to save the output (SAM format)' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
48 parser.add_option( '-g', '--genAlignType', help='The type of pairing (single or paired)' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
49 parser.add_option( '--params', help='Parameter setting to use (pre_set or full)' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
50 parser.add_option( '-s', '--fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
51 parser.add_option( '-D', '--dbkey', help='Dbkey for reference genome' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
52
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
53 parser.add_option( '-k', '--minEditDistSeed', default=19, type=int, help='Minimum edit distance to the seed [19]' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
54 parser.add_option( '-w', '--bandWidth', default=100, type=int, help='Band width for banded alignment [100]' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
55 parser.add_option( '-d', '--offDiagonal', default=100, type=int, help='off-diagonal X-dropoff [100]' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
56 parser.add_option( '-r', '--internalSeeds', default=1.5, type=float, help='look for internal seeds inside a seed longer than {-k} * FLOAT [1.5]' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
57 parser.add_option( '-c', '--seedsOccurrence', default=10000, type=int, help='skip seeds with more than INT occurrences [10000]' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
58 parser.add_option( '-S', '--mateRescue', default=False, help='skip mate rescue' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
59 parser.add_option( '-P', '--skipPairing', default=False, help='skpe pairing, mate rescue performed unless -S also in use' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
60 parser.add_option( '-A', '--seqMatch', default=1, type=int, help='score of a sequence match' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
61 parser.add_option( '-B', '--mismatch', default=4, type=int, help='penalty for a mismatch' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
62 parser.add_option( '-O', '--gapOpen', default=6, type=int, help='gap open penalty' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
63 parser.add_option( '-E', '--gapExtension', default=None, help='gap extension penalty; a gap of size k cost {-O} + {-E}*k [1]' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
64 parser.add_option( '-L', '--clipping', default=5, type=int, help='penalty for clipping [5]' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
65 parser.add_option( '-U', '--unpairedReadpair', default=17, type=int, help='penalty for an unpaired read pair [17]' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
66 parser.add_option( '-p', '--interPairEnd', default=False, help='first query file consists of interleaved paired-end sequences' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
67 parser.add_option( '--rgid', help='Read group identifier' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
68 parser.add_option( '--rgsm', help='Sample' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
69 parser.add_option( '--rgpl', choices=[ 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT', 'PACBIO' ], help='Platform/technology used to produce the reads' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
70 parser.add_option( '--rglb', help='Library name' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
71 parser.add_option( '--rgpu', help='Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
72 parser.add_option( '--rgcn', help='Sequencing center that produced the read' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
73 parser.add_option( '--rgds', help='Description' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
74 parser.add_option( '--rgdt', help='Date that run was produced (ISO8601 format date or date/time, like YYYY-MM-DD)' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
75 parser.add_option( '--rgfo', help='Flow order' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
76 parser.add_option( '--rgks', help='The array of nucleotide bases that correspond to the key sequence of each read' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
77 parser.add_option( '--rgpg', help='Programs used for processing the read group' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
78 parser.add_option( '--rgpi', help='Predicted median insert size' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
79 parser.add_option( '-T', '--minScore', default=30, type=int, help='minimum score to output [30]' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
80 parser.add_option( '-M', '--mark', default=False, help='mark shorter split hits as secondary (for Picard/GATK compatibility)' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
81 parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
82 (options, args) = parser.parse_args()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
83 if len(args) > 0:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
84 parser.error('Wrong number of arguments')
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
85
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
86 # output version # of tool
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
87 try:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
88 tmp = tempfile.NamedTemporaryFile().name
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
89 tmp_stdout = open( tmp, 'wb' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
90 proc = subprocess.Popen( args='bwa 2>&1', shell=True, stdout=tmp_stdout )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
91 tmp_stdout.close()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
92 returncode = proc.wait()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
93 stdout = None
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
94 for line in open( tmp_stdout.name, 'rb' ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
95 if line.lower().find( 'version' ) >= 0:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
96 stdout = line.strip()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
97 break
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
98 if stdout:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
99 sys.stdout.write( 'BWA %s\n' % stdout )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
100 else:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
101 raise Exception
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
102 except:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
103 sys.stdout.write( 'Could not determine BWA version\n' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
104
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
105 fastq = options.fastq
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
106 if options.rfastq:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
107 rfastq = options.rfastq
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
108
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
109 # make temp directory for placement of indices
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
110 tmp_index_dir = tempfile.mkdtemp()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
111 tmp_dir = tempfile.mkdtemp()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
112 # index if necessary
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
113 if options.fileSource == 'history':
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
114 ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
115 ref_file_name = ref_file.name
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
116 ref_file.close()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
117 os.symlink( options.ref, ref_file_name )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
118 # determine which indexing algorithm to use, based on size
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
119 try:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
120 size = os.stat( options.ref ).st_size
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
121 if size <= 2**30:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
122 indexingAlg = 'is'
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
123 else:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
124 indexingAlg = 'bwtsw'
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
125 except:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
126 indexingAlg = 'is'
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
127 indexing_cmds = '-a %s' % indexingAlg
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
128 cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
129 try:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
130 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
131 tmp_stderr = open( tmp, 'wb' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
132 proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
133 returncode = proc.wait()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
134 tmp_stderr.close()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
135 # get stderr, allowing for case where it's very large
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
136 tmp_stderr = open( tmp, 'rb' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
137 stderr = ''
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
138 buffsize = 1048576
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
139 try:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
140 while True:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
141 stderr += tmp_stderr.read( buffsize )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
142 if not stderr or len( stderr ) % buffsize != 0:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
143 break
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
144 except OverflowError:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
145 pass
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
146 tmp_stderr.close()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
147 if returncode != 0:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
148 raise Exception, stderr
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
149 except Exception, e:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
150 # clean up temp dirs
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
151 if os.path.exists( tmp_index_dir ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
152 shutil.rmtree( tmp_index_dir )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
153 if os.path.exists( tmp_dir ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
154 shutil.rmtree( tmp_dir )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
155 stop_err( 'Error indexing reference sequence. ' + str( e ) )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
156 else:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
157 ref_file_name = options.ref
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
158 # if options.illumina13qual:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
159 # illumina_quals = "-I"
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
160 # else:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
161 # illumina_quals = ""
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
162
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
163 # set up aligning and generate aligning command args
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
164 start_cmds = '-t %s ' % options.threads
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
165 if options.params == 'pre_set':
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
166 # aligning_cmds = '-t %s %s' % ( options.threads, illumina_quals )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
167 #start_cmds = '-t %s ' % options.threads
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
168 end_cmds = ' '
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
169 print start_cmds, end_cmds
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
170
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
171 else:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
172 end_cmds = '-k %s -w %s -d %s -r %s -c %s -A %s -B %s -O %s -L %s -U %s -T %s ' % (options.minEditDistSeed, options.bandWidth, options.offDiagonal, options.internalSeeds, options.seedsOccurrence, options.seqMatch, options.mismatch, options.gapOpen, options.clipping, options.unpairedReadpair, options.minScore)
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
173 if options.mateRescue:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
174 end_cmds += '-S '
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
175 if options.skipPairing:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
176 end_cmds += '-P '
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
177 else:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
178 if options.skipPairing:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
179 print "Option Error and will not be considered, you should also choose 'skip mate rescue -S' option! "
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
180 if options.gapExtension != None:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
181 end_cmds += '-E %s ' % options.gapExtension
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
182
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
183 if options.rgid:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
184 if not options.rglb or not options.rgpl or not options.rgsm or not options.rglb:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
185 stop_err( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
186 # readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
187 readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
188 if options.rgpu:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
189 readGroup += '\tPU:%s' % options.rgpu
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
190 if options.rgcn:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
191 readGroup += '\tCN:%s' % options.rgcn
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
192 if options.rgds:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
193 readGroup += '\tDS:%s' % options.rgds
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
194 if options.rgdt:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
195 readGroup += '\tDT:%s' % options.rgdt
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
196 if options.rgfo:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
197 readGroup += '\tFO:%s' % options.rgfo
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
198 if options.rgks:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
199 readGroup += '\tKS:%s' % options.rgks
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
200 if options.rgpg:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
201 readGroup += '\tPG:%s' % options.rgpg
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
202 if options.rgpi:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
203 readGroup += '\tPI:%s' % options.rgpi
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
204 end_cmds += ' -R "%s" ' % readGroup
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
205
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
206 if options.interPairEnd:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
207 end_cmds += '-p %s ' % options.interPairEnd
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
208 if options.mark:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
209 end_cmds += '-M '
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
210
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
211
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
212 if options.genAlignType == 'paired':
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
213 cmd = 'bwa mem %s %s %s %s %s > %s' % ( start_cmds, ref_file_name, fastq, rfastq, end_cmds, options.output )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
214 else:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
215 cmd = 'bwa mem %s %s %s %s > %s' % ( start_cmds, ref_file_name, fastq, end_cmds, options.output )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
216
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
217 # perform alignments
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
218 buffsize = 1048576
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
219 try:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
220 # need to nest try-except in try-finally to handle 2.4
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
221 try:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
222 try:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
223 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
224 tmp_stderr = open( tmp, 'wb' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
225 print "The cmd is %s" % cmd
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
226 proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
227 returncode = proc.wait()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
228 tmp_stderr.close()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
229 # get stderr, allowing for case where it's very large
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
230 tmp_stderr = open( tmp, 'rb' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
231 stderr = ''
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
232 try:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
233 while True:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
234 stderr += tmp_stderr.read( buffsize )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
235 if not stderr or len( stderr ) % buffsize != 0:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
236 break
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
237 except OverflowError:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
238 pass
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
239 tmp_stderr.close()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
240 if returncode != 0:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
241 raise Exception, stderr
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
242 except Exception, e:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
243 raise Exception, 'Error generating alignments. ' + str( e )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
244 # remove header if necessary
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
245 if options.suppressHeader == 'true':
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
246 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
247 tmp_out_name = tmp_out.name
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
248 tmp_out.close()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
249 try:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
250 shutil.move( options.output, tmp_out_name )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
251 except Exception, e:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
252 raise Exception, 'Error moving output file before removing headers. ' + str( e )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
253 fout = file( options.output, 'w' )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
254 for line in file( tmp_out.name, 'r' ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
255 if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
256 fout.write( line )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
257 fout.close()
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
258 # check that there are results in the output file
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
259 if os.path.getsize( options.output ) > 0:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
260 sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
261 else:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
262 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
263 except Exception, e:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
264 stop_err( 'The alignment failed.\n' + str( e ) )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
265 finally:
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
266 # clean up temp dir
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
267 if os.path.exists( tmp_index_dir ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
268 shutil.rmtree( tmp_index_dir )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
269 if os.path.exists( tmp_dir ):
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
270 shutil.rmtree( tmp_dir )
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
271
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
272 if __name__ == "__main__":
6820983ba5d5 Uploaded
crs4
parents:
diff changeset
273 __main__()