annotate bwa_wrapper.py @ 0:d6ba40f6c824

first commit
author cmonjeau
date Mon, 24 Aug 2015 09:29:12 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
1 #!/usr/bin/env python
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
2
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
3 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
4 Runs BWA on single-end or paired-end data.
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
5 Produces a SAM file containing the mappings.
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
6 Works with BWA version 0.5.9.
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
7
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
8 usage: bwa_wrapper.py [options]
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
9
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
10 See below for options
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
11 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
12
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
13 import optparse, os, shutil, subprocess, sys, tempfile
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
14 import glob
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
15 import gzip, zipfile, tarfile
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
16
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
17 def stop_err( msg ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
18 sys.stderr.write( '%s\n' % msg )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
19 sys.exit()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
20
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
21 def check_is_double_encoded( fastq ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
22 # check that first read is bases, not one base followed by numbers
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
23 bases = [ 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N' ]
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
24 nums = [ '0', '1', '2', '3' ]
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
25 for line in file( fastq, 'rb'):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
26 if not line.strip() or line.startswith( '@' ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
27 continue
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
28 if len( [ b for b in line.strip() if b in nums ] ) > 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
29 return False
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
30 elif line.strip()[0] in bases and len( [ b for b in line.strip() if b in bases ] ) == len( line.strip() ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
31 return True
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
32 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
33 raise Exception, 'First line in first read does not appear to be a valid FASTQ read in either base-space or color-space'
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
34 raise Exception, 'There is no non-comment and non-blank line in your FASTQ file'
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
35
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
36 def __main__():
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
37 #Parse Command Line
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
38 parser = optparse.OptionParser()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
39 parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
40 parser.add_option( '-c', '--color-space', dest='color_space', action='store_true', help='If the input files are SOLiD format' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
41 parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
42 parser.add_option( '-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
43 parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
44 parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
45 parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
46 parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
47 parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
48 parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
49 parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
50 parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
51 parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
52 parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
53 parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
54 parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
55 parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
56 parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
57 parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', default=None, help='Proceed with suboptimal alignments even if the top hit is a repeat' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
58 parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
59 parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Maximum number of alignments to output in the XA tag for reads paired properly' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
60 parser.add_option( '', '--outputTopNDisc', dest='outputTopNDisc', help='Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons)' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
61 parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
62 parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
63 parser.add_option( '', '--rgid', dest='rgid', help='Read group identifier' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
64 parser.add_option( '', '--rgcn', dest='rgcn', help='Sequencing center that produced the read' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
65 parser.add_option( '', '--rgds', dest='rgds', help='Description' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
66 parser.add_option( '', '--rgdt', dest='rgdt', help='Date that run was produced (ISO8601 format date or date/time, like YYYY-MM-DD)' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
67 parser.add_option( '', '--rgfo', dest='rgfo', help='Flow order' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
68 parser.add_option( '', '--rgks', dest='rgks', help='The array of nucleotide bases that correspond to the key sequence of each read' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
69 parser.add_option( '', '--rglb', dest='rglb', help='Library name' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
70 parser.add_option( '', '--rgpg', dest='rgpg', help='Programs used for processing the read group' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
71 parser.add_option( '', '--rgpi', dest='rgpi', help='Predicted median insert size' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
72 parser.add_option( '', '--rgpl', dest='rgpl', choices=[ 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT' and 'PACBIO' ], help='Platform/technology used to produce the reads' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
73 parser.add_option( '', '--rgpu', dest='rgpu', help='Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
74 parser.add_option( '', '--rgsm', dest='rgsm', help='Sample' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
75 parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
76 parser.add_option( '-X', '--do_not_build_index', dest='do_not_build_index', action='store_true', help="Don't build index" )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
77 parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
78 parser.add_option( '-I', '--illumina1.3', dest='illumina13qual', help='Input FASTQ files have Illuina 1.3 quality scores' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
79 (options, args) = parser.parse_args()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
80
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
81 tmp_input_dir = tempfile.mkdtemp()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
82 tmp_output_dir= tempfile.mkdtemp()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
83
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
84
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
85 myarchive = zipfile.ZipFile(options.fastq, 'r', allowZip64=True)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
86 myarchive.extractall(tmp_input_dir)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
87
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
88 for fastq in glob.glob(tmp_input_dir+'/*'):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
89
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
90 sam_output_file=tmp_output_dir+'/'+os.path.splitext(os.path.basename(fastq))[0]+'.sam'
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
91 create_sam=open(sam_output_file, "w")
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
92 create_sam.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
93
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
94 # output version # of tool
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
95 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
96 tmp = tempfile.NamedTemporaryFile().name
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
97 tmp_stdout = open( tmp, 'wb' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
98 proc = subprocess.Popen( args='bwa 2>&1', shell=True, stdout=tmp_stdout )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
99 tmp_stdout.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
100 returncode = proc.wait()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
101 stdout = None
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
102 for line in open( tmp_stdout.name, 'rb' ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
103 if line.lower().find( 'version' ) >= 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
104 stdout = line.strip()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
105 break
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
106 if stdout:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
107 sys.stdout.write( 'BWA %s\n' % stdout )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
108 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
109 raise Exception
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
110 except:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
111 sys.stdout.write( 'Could not determine BWA version\n' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
112
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
113 # check for color space fastq that's not double-encoded and exit if appropriate
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
114 if options.color_space:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
115 if not check_is_double_encoded( options.fastq ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
116 stop_err( 'Your file must be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
117 #if options.genAlignType == 'paired':
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
118 #if not check_is_double_encoded( options.rfastq ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
119 #stop_err( 'Your reverse reads file must also be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
120
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
121 #fastq = options.fastq
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
122 #if options.rfastq:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
123 #rfastq = options.rfastq
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
124
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
125 # set color space variable
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
126 if options.color_space:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
127 color_space = '-c'
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
128 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
129 color_space = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
130
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
131 # make temp directory for placement of indices
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
132 tmp_index_dir = tempfile.mkdtemp()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
133 tmp_dir = tempfile.mkdtemp()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
134 # index if necessary
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
135 if options.fileSource == 'history' and not options.do_not_build_index:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
136 ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
137 ref_file_name = ref_file.name
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
138 ref_file.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
139 os.symlink( options.ref, ref_file_name )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
140 # determine which indexing algorithm to use, based on size
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
141 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
142 size = os.stat( options.ref ).st_size
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
143 if size <= 2**30:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
144 indexingAlg = 'is'
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
145 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
146 indexingAlg = 'bwtsw'
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
147 except:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
148 indexingAlg = 'is'
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
149 indexing_cmds = '%s -a %s' % ( color_space, indexingAlg )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
150 cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
151 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
152 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
153 tmp_stderr = open( tmp, 'wb' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
154 proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
155 returncode = proc.wait()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
156 tmp_stderr.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
157 # get stderr, allowing for case where it's very large
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
158 tmp_stderr = open( tmp, 'rb' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
159 stderr = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
160 buffsize = 1048576
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
161 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
162 while True:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
163 stderr += tmp_stderr.read( buffsize )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
164 if not stderr or len( stderr ) % buffsize != 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
165 break
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
166 except OverflowError:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
167 pass
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
168 tmp_stderr.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
169 if returncode != 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
170 raise Exception, stderr
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
171 except Exception, e:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
172 # clean up temp dirs
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
173 if os.path.exists( tmp_index_dir ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
174 shutil.rmtree( tmp_index_dir )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
175 if os.path.exists( tmp_dir ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
176 shutil.rmtree( tmp_dir )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
177 stop_err( 'Error indexing reference sequence. ' + str( e ) )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
178 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
179 ref_file_name = options.ref
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
180 if options.illumina13qual:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
181 illumina_quals = "-I"
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
182 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
183 illumina_quals = ""
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
184
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
185 # set up aligning and generate aligning command options
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
186 if options.params == 'pre_set':
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
187 aligning_cmds = '-t %s %s %s' % ( options.threads, color_space, illumina_quals )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
188 gen_alignment_cmds = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
189 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
190 if options.maxEditDist != '0':
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
191 editDist = options.maxEditDist
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
192 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
193 editDist = options.fracMissingAligns
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
194 if options.seed != '-1':
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
195 seed = '-l %s' % options.seed
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
196 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
197 seed = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
198 if options.suboptAlign:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
199 suboptAlign = '-R "%s"' % ( options.suboptAlign )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
200 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
201 suboptAlign = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
202 if options.noIterSearch == 'true':
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
203 noIterSearch = '-N'
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
204 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
205 noIterSearch = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
206 aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s %s %s' % \
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
207 ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel,
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
208 options.disallowIndel, seed, options.maxEditDistSeed, options.threads,
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
209 options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty,
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
210 suboptAlign, noIterSearch, color_space, illumina_quals )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
211 #if options.genAlignType == 'paired':
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
212 #gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
213 #if options.outputTopNDisc:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
214 #gen_alignment_cmds += ' -N %s' % options.outputTopNDisc
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
215
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
216 gen_alignment_cmds = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
217 if options.rgid:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
218 if not options.rglb or not options.rgpl or not options.rgsm:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
219 stop_err( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
220 readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
221 if options.rgcn:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
222 readGroup += '\tCN:%s' % options.rgcn
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
223 if options.rgds:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
224 readGroup += '\tDS:%s' % options.rgds
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
225 if options.rgdt:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
226 readGroup += '\tDT:%s' % options.rgdt
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
227 if options.rgfo:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
228 readGroup += '\tFO:%s' % options.rgfo
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
229 if options.rgks:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
230 readGroup += '\tKS:%s' % options.rgks
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
231 if options.rgpg:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
232 readGroup += '\tPG:%s' % options.rgpg
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
233 if options.rgpi:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
234 readGroup += '\tPI:%s' % options.rgpi
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
235 if options.rgpu:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
236 readGroup += '\tPU:%s' % options.rgpu
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
237 gen_alignment_cmds += ' -r "%s"' % readGroup
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
238 if options.outputTopN:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
239 gen_alignment_cmds += ' -n %s' % options.outputTopN
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
240 # set up output files
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
241 tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
242 tmp_align_out_name = tmp_align_out.name
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
243 tmp_align_out.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
244 tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
245 tmp_align_out2_name = tmp_align_out2.name
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
246 tmp_align_out2.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
247 # prepare actual aligning and generate aligning commands
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
248 cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, fastq, tmp_align_out_name )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
249 cmd2b = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
250 cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, fastq, sam_output_file )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
251 # perform alignments
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
252 buffsize = 1048576
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
253 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
254 # need to nest try-except in try-finally to handle 2.4
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
255 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
256 # align
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
257 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
258 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
259 tmp_stderr = open( tmp, 'wb' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
260 proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
261 returncode = proc.wait()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
262 tmp_stderr.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
263 # get stderr, allowing for case where it's very large
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
264 tmp_stderr = open( tmp, 'rb' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
265 stderr = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
266 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
267 while True:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
268 stderr += tmp_stderr.read( buffsize )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
269 if not stderr or len( stderr ) % buffsize != 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
270 break
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
271 except OverflowError:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
272 pass
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
273 tmp_stderr.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
274 if returncode != 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
275 raise Exception, stderr
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
276 except Exception, e:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
277 raise Exception, 'Error aligning sequence. ' + str( e )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
278 # and again if paired data
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
279 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
280 if cmd2b:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
281 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
282 tmp_stderr = open( tmp, 'wb' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
283 proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
284 returncode = proc.wait()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
285 tmp_stderr.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
286 # get stderr, allowing for case where it's very large
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
287 tmp_stderr = open( tmp, 'rb' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
288 stderr = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
289 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
290 while True:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
291 stderr += tmp_stderr.read( buffsize )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
292 if not stderr or len( stderr ) % buffsize != 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
293 break
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
294 except OverflowError:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
295 pass
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
296 tmp_stderr.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
297 if returncode != 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
298 raise Exception, stderr
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
299 except Exception, e:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
300 raise Exception, 'Error aligning second sequence. ' + str( e )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
301 # generate align
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
302 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
303 tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
304 tmp_stderr = open( tmp, 'wb' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
305 proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
306 returncode = proc.wait()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
307 tmp_stderr.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
308 # get stderr, allowing for case where it's very large
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
309 tmp_stderr = open( tmp, 'rb' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
310 stderr = ''
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
311 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
312 while True:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
313 stderr += tmp_stderr.read( buffsize )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
314 if not stderr or len( stderr ) % buffsize != 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
315 break
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
316 except OverflowError:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
317 pass
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
318 tmp_stderr.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
319 if returncode != 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
320 raise Exception, stderr
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
321 except Exception, e:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
322 raise Exception, 'Error generating alignments. ' + str( e )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
323 # remove header if necessary
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
324 if options.suppressHeader == 'true':
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
325 tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
326 tmp_out_name = tmp_out.name
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
327 tmp_out.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
328 try:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
329 shutil.move( sam_output_file, tmp_out_name )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
330 except Exception, e:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
331 raise Exception, 'Error moving output file before removing headers. ' + str( e )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
332 fout = file( sam_output_file, 'w' )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
333 for line in file( tmp_out.name, 'r' ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
334 if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
335 fout.write( line )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
336 fout.close()
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
337 # check that there are results in the output file
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
338 if os.path.getsize( sam_output_file ) > 0:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
339 sys.stdout.write( 'BWA run on single-end data')
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
340 else:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
341 raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
342 except Exception, e:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
343 stop_err( 'The alignment failed.\n' + str( e ) )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
344 finally:
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
345 # clean up temp dir
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
346 if os.path.exists( tmp_index_dir ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
347 shutil.rmtree( tmp_index_dir )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
348 if os.path.exists( tmp_dir ):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
349 shutil.rmtree( tmp_dir )
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
350
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
351 # put all in an archive
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
352 mytotalzipfile=zipfile.ZipFile(options.output, 'w', allowZip64=True)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
353 os.chdir(tmp_output_dir)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
354 for samfile in glob.glob(tmp_output_dir+'/*'):
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
355 mytotalzipfile.write(os.path.basename(samfile))
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
356
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
357
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
358 if __name__=="__main__": __main__()