annotate bismark_methylation_extractor.py @ 5:b100248c35b8 draft

Uploaded
author bgruening
date Mon, 09 Feb 2015 18:27:40 -0500
parents 91f07ff056ca
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3 import argparse, os, shutil, subprocess, sys, tempfile, fileinput
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4 import zipfile
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5 from glob import glob
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7 def stop_err( msg ):
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
8 sys.stderr.write( "%s\n" % msg )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
9 sys.exit()
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
10
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
11 def zipper(dir, zip_file):
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
12 zip = zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
13 root_len = len(os.path.abspath(dir))
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
14 for root, dirs, files in os.walk(dir):
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
15 archive_root = os.path.abspath(root)[root_len:]
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
16 for f in files:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
17 fullpath = os.path.join(root, f)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
18 archive_name = os.path.join(archive_root, f)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
19 zip.write(fullpath, archive_name, zipfile.ZIP_DEFLATED)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
20 zip.close()
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
21 return zip_file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
22
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
23 def __main__():
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
24 #Parse Command Line
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
25 parser = argparse.ArgumentParser(description='Wrapper for the bismark methylation caller.')
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
26
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
27 # input options
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
28 parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
29
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
30 parser.add_argument( '--infile', help='Input file in SAM or BAM format.' )
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
31 parser.add_argument( '--single-end', dest='single_end', action="store_true" )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
32 parser.add_argument( '--paired-end', dest='paired_end', action="store_true" )
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
33
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
34 parser.add_argument( '--report-file', dest='report_file' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
35 parser.add_argument( '--comprehensive', action="store_true" )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
36 parser.add_argument( '--merge-non-cpg', dest='merge_non_cpg', action="store_true" )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
37 parser.add_argument( '--no-overlap', dest='no_overlap', action="store_true" )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
38 parser.add_argument( '--compress' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
39 parser.add_argument( '--ignore-bps', dest='ignore_bps', type=int )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
40
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
41 # OT - original top strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
42 parser.add_argument( '--cpg_ot' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
43 parser.add_argument( '--chg_ot' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
44 parser.add_argument( '--chh_ot' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
45 # CTOT - complementary to original top strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
46 parser.add_argument( '--cpg_ctot' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
47 parser.add_argument( '--chg_ctot' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
48 parser.add_argument( '--chh_ctot' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
49 # OB - original bottom strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
50 parser.add_argument( '--cpg_ob' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
51 parser.add_argument( '--chg_ob' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
52 parser.add_argument( '--chh_ob' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
53 # CTOT - complementary to original bottom strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
54 parser.add_argument( '--cpg_ctob' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
55 parser.add_argument( '--chg_ctob' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
56 parser.add_argument( '--chh_ctob' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
57
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
58 parser.add_argument( '--cpg_context' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
59 parser.add_argument( '--chg_context' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
60 parser.add_argument( '--chh_context' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
61
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
62 parser.add_argument( '--non_cpg_context' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
63
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
64 parser.add_argument( '--non_cpg_context_ot' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
65 parser.add_argument( '--non_cpg_context_ctot' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
66 parser.add_argument( '--non_cpg_context_ob' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
67 parser.add_argument( '--non_cpg_context_ctob' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
68
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
69 args = parser.parse_args()
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
70
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
71
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
72 # Build methylation extractor command
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
73 output_dir = tempfile.mkdtemp()
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
74 cmd = 'bismark_methylation_extractor --no_header -o %s %s %s'
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
75 if args.bismark_path:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
76 # add the path to the bismark perl scripts, that is needed for galaxy
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
77 cmd = os.path.join(args.bismark_path, cmd)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
78
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
79 additional_opts = ''
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
80 # Set up all options
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
81 if args.single_end:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
82 additional_opts += ' --single-end '
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
83 else:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
84 additional_opts += ' --paired-end '
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
85 if args.no_overlap:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
86 additional_opts += ' --no_overlap '
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
87 if args.ignore_bps:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
88 additional_opts += ' --ignore %s ' % args.ignore_bps
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
89 if args.comprehensive:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
90 additional_opts += ' --comprehensive '
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
91 if args.merge_non_cpg:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
92 additional_opts += ' --merge_non_CpG '
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
93 if args.report_file:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
94 additional_opts += ' --report '
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
95
3
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
96 #detect BAM file, use samtools view if it is a bam file
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
97 f = open (args.infile, 'rb')
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
98 sig = f.read(4)
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
99 f.close()
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
100 if sig == '\x1f\x8b\x08\x04' :
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
101 cmd = cmd % (output_dir, additional_opts, '-')
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
102 cmd = 'samtools view %s | %s' % (args.infile, cmd )
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
103 else :
91f07ff056ca Uploaded
bgruening
parents: 0
diff changeset
104 cmd = cmd % (output_dir, additional_opts, args.infile)
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
105
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
106 # Run
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
107 try:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
108 tmp_out = tempfile.NamedTemporaryFile().name
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
109 tmp_stdout = open( tmp_out, 'wb' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
110 tmp_err = tempfile.NamedTemporaryFile().name
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
111 tmp_stderr = open( tmp_err, 'wb' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
112 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
113 returncode = proc.wait()
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
114 tmp_stderr.close()
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
115 # get stderr, allowing for case where it's very large
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
116 tmp_stderr = open( tmp_err, 'rb' )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
117 stderr = ''
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
118 buffsize = 1048576
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
119 try:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
120 while True:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
121 stderr += tmp_stderr.read( buffsize )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
122 if not stderr or len( stderr ) % buffsize != 0:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
123 break
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
124 except OverflowError:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
125 pass
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
126 tmp_stdout.close()
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
127 tmp_stderr.close()
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
128 if returncode != 0:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
129 raise Exception, stderr
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
130
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
131 # TODO: look for errors in program output.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
132 except Exception, e:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
133 stop_err( 'Error in bismark methylation extractor:\n' + str( e ) )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
134
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
135
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
136 # collect and copy output files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
137
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
138 if args.compress:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
139 zipper(output_dir, args.compress)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
140
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
141
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
142 if args.cpg_ot:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
143 shutil.move( glob(os.path.join( output_dir, '*CpG_OT_*'))[0], args.cpg_ot )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
144 if args.chg_ot:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
145 shutil.move( glob(os.path.join( output_dir, '*CHG_OT_*'))[0], args.chg_ot )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
146 if args.chh_ot:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
147 shutil.move( glob(os.path.join( output_dir, '*CHH_OT_*'))[0], args.chh_ot )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
148 if args.cpg_ctot:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
149 shutil.move( glob(os.path.join( output_dir, '*CpG_CTOT_*'))[0], args.cpg_ctot )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
150 if args.chg_ctot:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
151 shutil.move( glob(os.path.join( output_dir, '*CHG_CTOT_*'))[0], args.chg_ctot )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
152 if args.chh_ctot:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
153 shutil.move( glob(os.path.join( output_dir, '*CHH_CTOT_*'))[0], args.chh_ctot )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
154 if args.cpg_ob:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
155 shutil.move( glob(os.path.join( output_dir, '*CpG_OB_*'))[0], args.cpg_ob )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
156 if args.chg_ob:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
157 shutil.move( glob(os.path.join( output_dir, '*CHG_OB_*'))[0], args.chg_ob )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
158 if args.chh_ob:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
159 shutil.move( glob(os.path.join( output_dir, '*CHH_OB_*'))[0], args.chh_ob )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
160 if args.cpg_ctob:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
161 shutil.move( glob(os.path.join( output_dir, '*CpG_CTOB_*'))[0], args.cpg_ctob )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
162 if args.chg_ctob:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
163 shutil.move( glob(os.path.join( output_dir, '*CHG_CTOB_*'))[0], args.chg_ctob )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
164 if args.chh_ctob:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
165 shutil.move( glob(os.path.join( output_dir, '*CHH_CTOB_*'))[0], args.chh_ctob )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
166
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
167 # context-dependent methylation output files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
168 if args.cpg_context:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
169 shutil.move( glob(os.path.join( output_dir, '*CpG_context_*'))[0], args.cpg_context )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
170 if args.chg_context:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
171 shutil.move( glob(os.path.join( output_dir, '*CHG_context_*'))[0], args.chg_context )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
172 if args.chh_context:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
173 shutil.move( glob(os.path.join( output_dir, '*CHH_context_*'))[0], args.chh_context )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
174
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
175 if args.non_cpg_context:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
176 shutil.move( glob(os.path.join( output_dir, '*Non_CpG_context_*'))[0], args.non_cpg_context )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
177
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
178 if args.non_cpg_context_ot:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
179 shutil.move( glob(os.path.join( output_dir, '*Non_CpG_OT_*'))[0], args.non_cpg_context_ot )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
180 if args.non_cpg_context_ctot:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
181 shutil.move( glob(os.path.join( output_dir, '*Non_CpG_CTOT_*'))[0], args.non_cpg_context_ctot )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
182 if args.non_cpg_context_ob:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
183 shutil.move( glob(os.path.join( output_dir, '*Non_CpG_OB_*'))[0], args.non_cpg_context_ob )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
184 if args.non_cpg_context_ctob:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
185 shutil.move( glob(os.path.join( output_dir, '*Non_CpG_CTOB_*'))[0], args.non_cpg_context_ctob )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
186
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
187
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
188
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
189 if args.report_file:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
190 shutil.move( glob(os.path.join( output_dir, '*_splitting_report*'))[0], args.report_file )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
191
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
192
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
193 # Clean up temp dirs
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
194 if os.path.exists( output_dir ):
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
195 shutil.rmtree( output_dir )
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
196
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
197 if __name__=="__main__": __main__()