annotate BSseeker2/bs_seeker2-align.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
weilong-guo
parents: 0
diff changeset
1 #!/usr/bin/env python
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
3 from optparse import OptionParser, OptionGroup
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
4 import re
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
5 import tempfile
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
6 from bs_align import output
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
7 from bs_align.bs_pair_end import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
8 from bs_align.bs_single_end import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
9 from bs_align.bs_rrbs import *
1
weilong-guo
parents: 0
diff changeset
10 #import re
weilong-guo
parents: 0
diff changeset
11 #from bs_utils.utils import *
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
12
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
13
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
14 if __name__ == '__main__':
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
15
1
weilong-guo
parents: 0
diff changeset
16 parser = OptionParser(usage="Usage: %prog {-i <single> | -1 <mate1> -2 <mate2>} -g <genome.fa> [options]")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
17 # option group 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
18 opt_group = OptionGroup(parser, "For single end reads")
1
weilong-guo
parents: 0
diff changeset
19 opt_group.add_option("-i", "--input", type="string", dest="infilename",help="Input read file (FORMAT: sequences, qseq, fasta, fastq). Ex: read.fa or read.fa.gz", metavar="INFILE")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
20 parser.add_option_group(opt_group)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
21
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
22 # option group 2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
23 opt_group = OptionGroup(parser, "For pair end reads")
1
weilong-guo
parents: 0
diff changeset
24 opt_group.add_option("-1", "--input_1", type="string", dest="infilename_1",help="Input read file, mate 1 (FORMAT: sequences, qseq, fasta, fastq)", metavar="FILE")
weilong-guo
parents: 0
diff changeset
25 opt_group.add_option("-2", "--input_2", type="string", dest="infilename_2",help="Input read file, mate 2 (FORMAT: sequences, qseq, fasta, fastq)", metavar="FILE")
weilong-guo
parents: 0
diff changeset
26 opt_group.add_option("-I", "--minins",type = "int",dest = "min_insert_size", help="The minimum insert size for valid paired-end alignments [Default: %default]", default = 0)
weilong-guo
parents: 0
diff changeset
27 opt_group.add_option("-X", "--maxins",type = "int",dest = "max_insert_size", help="The maximum insert size for valid paired-end alignments [Default: %default]", default = 500)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
28 parser.add_option_group(opt_group)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
29
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
30 # option group 3
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
31 opt_group = OptionGroup(parser, "Reduced Representation Bisulfite Sequencing Options")
1
weilong-guo
parents: 0
diff changeset
32 opt_group.add_option("-r", "--rrbs", action="store_true", dest="rrbs", default = False, help = 'Map reads to the Reduced Representation genome')
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
33 opt_group.add_option("-c", "--cut-site", type="string",dest="cut_format", help="Cutting sites of restriction enzyme. Ex: MspI(C-CGG), Mael:(C-TAG), double-enzyme MspI&Mael:(C-CGG,C-TAG). [Default: %default]", metavar="pattern", default = "C-CGG")
1
weilong-guo
parents: 0
diff changeset
34 opt_group.add_option("-L", "--low", type = "int", dest="rrbs_low_bound", help="Lower bound of fragment length (excluding C-CGG ends) [Default: %default]", default = 20)
weilong-guo
parents: 0
diff changeset
35 opt_group.add_option("-U", "--up", type = "int", dest="rrbs_up_bound", help="Upper bound of fragment length (excluding C-CGG ends) [Default: %default]", default = 500)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
36 parser.add_option_group(opt_group)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
37
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
38 # option group 4
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
39 opt_group = OptionGroup(parser, "General options")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
40 opt_group.add_option("-t", "--tag", type="string", dest="taginfo",help="[Y]es for undirectional lib, [N]o for directional [Default: %default]", metavar="TAG", default = 'N')
1
weilong-guo
parents: 0
diff changeset
41 opt_group.add_option("-s","--start_base",type = "int",dest = "cutnumber1", help="The first cycle of the read to be mapped [Default: %default]", default = 1)
weilong-guo
parents: 0
diff changeset
42 opt_group.add_option("-e","--end_base",type = "int",dest = "cutnumber2", help="The last cycle of the read to be mapped [Default: %default]", default = 200)
weilong-guo
parents: 0
diff changeset
43 opt_group.add_option("-a", "--adapter", type="string", dest="adapter_file",help="Input text file of your adaptor sequences (to be trimmed from the 3'end of the reads, ). "
weilong-guo
parents: 0
diff changeset
44 "Input one seq for dir. lib., twon seqs for undir. lib. One line per sequence. "
weilong-guo
parents: 0
diff changeset
45 "Only the first 10bp will be used", metavar="FILE", default = '')
weilong-guo
parents: 0
diff changeset
46 opt_group.add_option("--am",type = "int",dest = "adapter_mismatch", help="Number of mismatches allowed in adapter [Default: %default]", default = 0)
weilong-guo
parents: 0
diff changeset
47 opt_group.add_option("-g", "--genome", type="string", dest="genome",help="Name of the reference genome (should be the same as \"-f\" in bs_seeker2-build.py ) [ex. chr21_hg18.fa]")
weilong-guo
parents: 0
diff changeset
48 opt_group.add_option("-m", "--mismatches",type = "float", dest="no_mismatches",help="Number of mismatches in one read [Default: %default]", default = 4)
weilong-guo
parents: 0
diff changeset
49 opt_group.add_option("--aligner", dest="aligner",help="Aligner program for short reads mapping: " + ', '.join(supported_aligners) + " [Default: %default]", metavar="ALIGNER", default = BOWTIE)
weilong-guo
parents: 0
diff changeset
50 opt_group.add_option("-p", "--path", dest="aligner_path", help="Path to the aligner program. Detected: " +' '*70+ '\t'.join(('%s: %s '+' '*70) % (al, aligner_path[al]) for al in sorted(supported_aligners)),
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
51 metavar="PATH"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
52 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
53 opt_group.add_option("-d", "--db", type="string", dest="dbpath",help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]" , metavar="DBPATH", default = reference_genome_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
54 opt_group.add_option("-l", "--split_line",type = "int", dest="no_split",help="Number of lines per split (the read file will be split into small files for mapping. The result will be merged. [Default: %default]", default = 4000000)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
55 opt_group.add_option("-o", "--output", type="string", dest="outfilename",help="The name of output file [INFILE.bs(se|pe|rrbs)]", metavar="OUTFILE")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
56 opt_group.add_option("-f", "--output-format", type="string", dest="output_format",help="Output format: "+', '.join(output.formats)+" [Default: %default]", metavar="FORMAT", default = output.BAM)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
57 opt_group.add_option("--no-header", action="store_true", dest="no_SAM_header",help="Suppress SAM header lines [Default: %default]", default = False)
1
weilong-guo
parents: 0
diff changeset
58 opt_group.add_option("--temp_dir", type="string", dest="temp_dir",help="The path to your temporary directory [Detected: %default]", metavar="PATH", default = tempfile.gettempdir())
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
59 opt_group.add_option("--XS",type = "string", dest="XS_filter",help="Filter definition for tag XS, format X,Y. X=0.8 and y=5 indicate that for one read, if #(mCH sites)/#(all CH sites)>0.8 and #(mCH sites)>5, then tag XS=1; or else tag XS=0. [Default: %default]", default = "0.5,5") # added by weilong
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
60 opt_group.add_option("--multiple-hit", action="store_true", dest="Output_multiple_hit", default = False, help = 'Output reads with multiple hits to file\"Multiple_hit.fa\"')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
61
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
62 opt_group.add_option("-v", "--version", action="store_true", dest="version",help="show version of BS-Seeker2", metavar="version", default = False)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
63
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
64 parser.add_option_group(opt_group)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
65
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
66 # option group 5
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
67 opt_group = OptionGroup(parser, "Aligner Options",
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
68 "You may specify any additional options for the aligner. You just have to prefix them with " +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
69 ', '.join('%s for %s' % (aligner_options_prefixes[aligner], aligner) for aligner in supported_aligners)+
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
70 ', and BS Seeker will pass them on. For example: --bt-p 4 will increase the number of threads for bowtie to 4, '
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
71 '--bt--tryhard will instruct bowtie to try as hard as possible to find valid alignments when they exist, and so on. '
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
72 'Be sure that you know what you are doing when using these options! Also, we don\'t do any validation on the values.')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
73 parser.add_option_group(opt_group)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
74
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
75
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
76 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
77 # separate aligner options from BS Seeker options
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
78 aligner_options = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
79 bs_seeker_options = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
80 i = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
81 while i < len(sys.argv):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
82 arg = sys.argv[i]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
83 m = re.match(r'^%s' % '|'.join('(%s)'% aligner_options_prefixes[al] for al in supported_aligners), arg)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
84 if m:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
85 a_opt = arg.replace(m.group(0),'-',1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
86 aligner_options[a_opt] = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
87 while i + 1 < len(sys.argv) and sys.argv[i+1][0] != '-':
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
88 aligner_options[a_opt].append(sys.argv[i+1])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
89 i += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
90 if len(aligner_options[a_opt]) == 0: # if it is a key-only option
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
91 aligner_options[a_opt] = True
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
92 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
93 bs_seeker_options.append(arg)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
94 i += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
95
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
96
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
97 (options, args) = parser.parse_args(args = bs_seeker_options)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
98
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
99
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
100 # if no options were given by the user, print help and exit
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
101 if len(sys.argv) == 1:
1
weilong-guo
parents: 0
diff changeset
102 parser.print_help()
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
103 exit(0)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
104
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
105 if options.version :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
106 show_version()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
107 exit (-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
108 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
109 show_version()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
110
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
111 # check parameters
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
112 # input read files
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
113 if options.infilename and (options.infilename_1 or options.infilename_2):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
114 error('-i and [-1|-2] options are exclusive. You should use only one of them.')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
115
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
116 if not (options.infilename or (options.infilename_1 and options.infilename_2)):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
117 error('You should set either -i or -1 and -2 options.')
1
weilong-guo
parents: 0
diff changeset
118
weilong-guo
parents: 0
diff changeset
119 # Calculate the length of read
weilong-guo
parents: 0
diff changeset
120 if options.infilename :
weilong-guo
parents: 0
diff changeset
121 read_file = options.infilename
weilong-guo
parents: 0
diff changeset
122 elif options.infilename_1 :
weilong-guo
parents: 0
diff changeset
123 read_file = options.infilename_1
weilong-guo
parents: 0
diff changeset
124 else :
weilong-guo
parents: 0
diff changeset
125 error('You should at least specify -i or -1 options.')
weilong-guo
parents: 0
diff changeset
126
weilong-guo
parents: 0
diff changeset
127 try :
weilong-guo
parents: 0
diff changeset
128 if read_file.endswith(".gz") : # support input file ending with ".gz"
weilong-guo
parents: 0
diff changeset
129 read_inf = gzip.open(read_file, "rb")
weilong-guo
parents: 0
diff changeset
130 else :
weilong-guo
parents: 0
diff changeset
131 read_inf=open(read_file,"r")
weilong-guo
parents: 0
diff changeset
132 except IOError :
weilong-guo
parents: 0
diff changeset
133 print "[Error] Cannot open input file : %s" % read_file
weilong-guo
parents: 0
diff changeset
134 exit(-1)
weilong-guo
parents: 0
diff changeset
135 oneline = read_inf.readline()
weilong-guo
parents: 0
diff changeset
136 oneline = read_inf.readline() # get the second line
weilong-guo
parents: 0
diff changeset
137 read_len = min(len(oneline), (options.cutnumber2-options.cutnumber1))
weilong-guo
parents: 0
diff changeset
138 read_inf.close()
weilong-guo
parents: 0
diff changeset
139 # mismatch allowed: bowtie 1,build-in parameter '-m'; bowtie 2, post-filter paramter
weilong-guo
parents: 0
diff changeset
140 # mismatch should no greater than the read length
weilong-guo
parents: 0
diff changeset
141 no_mismatches = float(options.no_mismatches)
weilong-guo
parents: 0
diff changeset
142 if (no_mismatches < 1) :
weilong-guo
parents: 0
diff changeset
143 int_no_mismatches=int(no_mismatches * read_len)
weilong-guo
parents: 0
diff changeset
144 else :
weilong-guo
parents: 0
diff changeset
145 int_no_mismatches=int(no_mismatches)
weilong-guo
parents: 0
diff changeset
146
weilong-guo
parents: 0
diff changeset
147 str_no_mismatches=str(options.no_mismatches) # pass to specific mode
weilong-guo
parents: 0
diff changeset
148
weilong-guo
parents: 0
diff changeset
149
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
150 # -t, directional / un-directional library
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
151 asktag=str(options.taginfo).upper()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
152 if asktag not in 'YN':
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
153 error('-t option should be either Y or N, not %s' % asktag)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
154 # -a
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
155 if options.aligner not in supported_aligners:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
156 error('-a option should be: %s' % ' ,'.join(supported_aligners)+'.')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
157 # path for aligner
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
158 aligner_exec = os.path.expanduser( os.path.join(options.aligner_path or aligner_path[options.aligner], options.aligner) )
1
weilong-guo
parents: 0
diff changeset
159
weilong-guo
parents: 0
diff changeset
160
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
161 # -g
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
162 if options.genome is None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
163 error('-g is a required option')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
164 genome = os.path.split(options.genome)[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
165 genome_subdir = genome
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
166
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
167 # try to guess the location of the reference genome for RRBS
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
168 if options.rrbs:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
169 if options.rrbs_low_bound and options.rrbs_up_bound:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
170 if options.cut_format == "C-CGG" :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
171 genome_subdir += '_rrbs_%d_%d' % (options.rrbs_low_bound, options.rrbs_up_bound)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
172 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
173 genome_subdir += '_rrbs_%s_%d_%d' % ( re.sub(",","-",re.sub("-", "", options.cut_format)), options.rrbs_low_bound, options.rrbs_up_bound)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
174 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
175 possible_refs = filter(lambda dir: dir.startswith(genome+'_rrbs_'), os.listdir(options.dbpath))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
176 if len(possible_refs) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
177 genome_subdir = possible_refs[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
178 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
179 error('Cannot localize unambiguously the reference genome for RRBS. '
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
180 'Please, specify the options \"--low\" and \"--up\" that you used at the index-building step.\n'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
181 'Possible choices are:\n' + '\n'.join([pr.split('_rrbs_')[-1].replace('_',', ') for pr in possible_refs]))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
182
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
183 db_path = os.path.expanduser(os.path.join(options.dbpath, genome_subdir + '_' + options.aligner))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
184
1
weilong-guo
parents: 0
diff changeset
185
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
186 if not os.path.isdir(db_path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
187 error('Index DIR \"' + genome_subdir + '..\" cannot be found in ' + options.dbpath +'.\n\tPlease run the bs_seeker2-build.py '
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
188 'to create it with the correct parameters for -g, -r, --low, --up and --aligner.')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
189
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
190 # default aligner options
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
191 aligner_options_defaults = {
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
192 BOWTIE : { '-e' : 40*int_no_mismatches,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
193 '--nomaqround' : True,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
194 '--norc' : True,
1
weilong-guo
parents: 0
diff changeset
195 #'-k' : 2,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
196 # -k=2; report two best hits, and filter by error rates
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
197 '--quiet' : True,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
198 '--best' : True,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
199 # '--suppress' : '2,5,6',
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
200 '--sam' : True,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
201 '--sam-nohead' : True,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
202 '-p' : 2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
203 },
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
204 BOWTIE2 : {
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
205 #'-M' : 5,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
206 '--norc' : True,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
207 '--quiet' : True,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
208 '-p' : 2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
209 '--sam-nohead' : True,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
210 # run bowtie2 in local mode by default
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
211 '--local' : '--end-to-end' not in aligner_options,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
212 #'--mm' : True,
1
weilong-guo
parents: 0
diff changeset
213 #'-k' : 2
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
214 },
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
215 SOAP : { '-v' : int_no_mismatches,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
216 '-p' : 2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
217 '-r' : 2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
218 '-M' : 4
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
219 },
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
220 RMAP : { '-M' : 2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
221 # to do # control for only mapping on + strand
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
222 }
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
223
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
224 }
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
225
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
226 if '--end-to-end' not in aligner_options:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
227 aligner_options_defaults[BOWTIE2].update({'-D' : 50})
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
228 #aligner_options_defaults[BOWTIE2].update({'-D' : 50, '-R': 3, '-N': 0, '-L': 15, '-i' : 'S,1,0.50'})
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
229 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
230 aligner_options_defaults[BOWTIE2].update({'-D' : 50, '-L': 15, '--score-min': 'L,-0.6,-0.6' })
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
231
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
232 aligner_options = dict(aligner_options_defaults[options.aligner], **aligner_options)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
233
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
234 aligner_options_string = lambda : ' %s ' % (' '.join(opt_key +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
235 (' ' + ' '.join(map(str,opt_val)) # join all values if the value is an array
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
236 if type(opt_val) is list else
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
237 ('' if type(opt_val) is bool and opt_val # output an empty string if it is a key-only option
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
238 else ' ' +str(opt_val)) # output the value if it is a single value
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
239 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
240 for opt_key, opt_val in aligner_options.iteritems() if opt_val not in [None, False]))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
241
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
242
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
243 # tmp_path = (options.outfilename or options.infilename or options.infilename_1) +'-'+ options.aligner+ '-TMP'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
244 # clear_dir(tmp_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
245
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
246 if options.output_format not in output.formats:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
247 error('Output format should be one of: ' + ', '.join(output.formats))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
248
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
249 if options.outfilename:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
250 outfilename = options.outfilename
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
251 logfilename = outfilename
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
252 elif options.infilename is not None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
253 logfilename = options.infilename+'_'+ ('rr' if options.rrbs else '') + 'bsse'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
254 outfilename = logfilename + '.' + options.output_format
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
255 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
256 logfilename = options.infilename_1+'_'+ ('rr' if options.rrbs else '') + 'bspe'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
257 outfilename = logfilename + '.' + options.output_format
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
258
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
259 outfilename = os.path.expanduser(outfilename)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
260 logfilename = os.path.expanduser(logfilename)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
261 outfile = output.outfile(outfilename, options.output_format, deserialize(os.path.join(db_path, 'refname')), ' '.join(sys.argv), options.no_SAM_header)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
262
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
263 open_log(logfilename+'.bs_seeker2_log')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
264
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
265 aligner_title = options.aligner
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
266 if options.aligner == BOWTIE2 :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
267 if '--end-to-end' in aligner_options :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
268 aligner_title = aligner_title + "-e2e"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
269 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
270 aligner_title = aligner_title + "-local"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
271
1
weilong-guo
parents: 0
diff changeset
272 if options.aligner == BOWTIE :
weilong-guo
parents: 0
diff changeset
273 logm("Mode: Bowtie")
weilong-guo
parents: 0
diff changeset
274 elif options.aligner == BOWTIE2 :
weilong-guo
parents: 0
diff changeset
275 if '--end-to-end' not in aligner_options :
weilong-guo
parents: 0
diff changeset
276 logm("Mode: Bowtie2, local alignment")
weilong-guo
parents: 0
diff changeset
277 else :
weilong-guo
parents: 0
diff changeset
278 logm("Mode: Bowtie2, end-to-end alignment")
weilong-guo
parents: 0
diff changeset
279
weilong-guo
parents: 0
diff changeset
280
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
281 tmp_path = tempfile.mkdtemp(prefix='bs_seeker2_%s_-%s-TMP-' % (os.path.split(outfilename)[1], aligner_title ), dir = options.temp_dir)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
282
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
283
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
284 (XS_x, XS_y) = options.XS_filter.split(",")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
285 XS_pct = float(XS_x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
286 XS_count = int(XS_y)
1
weilong-guo
parents: 0
diff changeset
287 logm('Filter for tag XS: #(mCH)/#(all CH)>%.2f%% and #(mCH)>%d' % (XS_pct*100, XS_count))
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
288
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
289
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
290 logm('Temporary directory: %s' % tmp_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
291 logm('Reduced Representation Bisulfite Sequencing: %s' % str(options.rrbs))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
292 if options.infilename is not None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
293 logm('Single end')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
294
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
295 aligner_command = aligner_exec + aligner_options_string() + \
1
weilong-guo
parents: 0
diff changeset
296 { BOWTIE : ' -k 2 %(reference_genome)s -f %(input_file)s %(output_file)s',
weilong-guo
parents: 0
diff changeset
297 BOWTIE2 : ' -k 2 -x %(reference_genome)s -f -U %(input_file)s -S %(output_file)s',
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
298 SOAP : ' -D %(reference_genome)s.fa.index -o %(output_file)s -a %(input_file)s',
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
299 RMAP : ' -c %(reference_genome)s.fa -o %(output_file)s %(input_file)s'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
300 }[options.aligner]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
301 logm ('Aligner command: %s' % aligner_command)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
302 # single end reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
303 if options.rrbs: # RRBS scan
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
304 bs_rrbs(options.infilename,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
305 asktag,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
306 options.adapter_file,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
307 options.cutnumber1,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
308 options.cutnumber2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
309 options.no_split,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
310 str_no_mismatches,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
311 aligner_command,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
312 db_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
313 tmp_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
314 outfile,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
315 XS_pct,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
316 XS_count,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
317 options.adapter_mismatch,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
318 options.cut_format,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
319 options.Output_multiple_hit
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
320 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
321 else: # Normal single end scan
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
322 bs_single_end( options.infilename,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
323 asktag,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
324 options.adapter_file,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
325 options.cutnumber1,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
326 options.cutnumber2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
327 options.no_split,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
328 str_no_mismatches,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
329 aligner_command,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
330 db_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
331 tmp_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
332 outfile,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
333 XS_pct,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
334 XS_count,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
335 options.adapter_mismatch,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
336 options.Output_multiple_hit
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
337 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
338 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
339 logm('Pair end')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
340 # pair end specific default options
1
weilong-guo
parents: 0
diff changeset
341 aligner_options = dict({BOWTIE: {'--fr' : True,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
342 '-X' : options.max_insert_size,
1
weilong-guo
parents: 0
diff changeset
343 '-I' : options.min_insert_size if options.min_insert_size > 0 else None,
weilong-guo
parents: 0
diff changeset
344 '-a' : True # "-k 2" in bowtie would not report the best two
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
345 },
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
346 BOWTIE2 : {
1
weilong-guo
parents: 0
diff changeset
347 '--fr' : True,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
348 '-X' : options.max_insert_size,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
349 '-I' : options.min_insert_size if options.min_insert_size > 0 else None,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
350 '--no-discordant' : True,
1
weilong-guo
parents: 0
diff changeset
351 '--no-mixed' : True,
weilong-guo
parents: 0
diff changeset
352 '-k' : 2
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
353 },
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
354 SOAP: {
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
355 '-x' : options.max_insert_size,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
356 '-m' : options.min_insert_size if options.min_insert_size > 0 else 100
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
357 }}[options.aligner],
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
358 # integrating 'rmappe' is different from others
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
359 **aligner_options)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
360
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
361 aligner_command = aligner_exec + aligner_options_string() + \
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
362 { BOWTIE : ' %(reference_genome)s -f -1 %(input_file_1)s -2 %(input_file_2)s %(output_file)s',
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
363 BOWTIE2 : ' -x %(reference_genome)s -f -1 %(input_file_1)s -2 %(input_file_2)s -S %(output_file)s',
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
364 SOAP : ' -D %(reference_genome)s.fa.index -o %(output_file)s -a %(input_file_1)s -b %(input_file_2)s -2 %(output_file)s.unpaired' #,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
365 # RMAP : # rmappe, also paste two inputs into one file.
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
366 }[options.aligner]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
367
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
368 logm('Aligner command: %s' % aligner_command)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
369
1
weilong-guo
parents: 0
diff changeset
370 if '--end-to-end' not in aligner_options:
weilong-guo
parents: 0
diff changeset
371 aligner_options_defaults[BOWTIE2].update({'-D' : 50})
weilong-guo
parents: 0
diff changeset
372 else:
weilong-guo
parents: 0
diff changeset
373 aligner_options_defaults[BOWTIE2].update({'-D' : 50, '-L': 15, '--score-min': 'L,-0.6,-0.6' })
weilong-guo
parents: 0
diff changeset
374
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
375 bs_pair_end(options.infilename_1,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
376 options.infilename_2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
377 asktag,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
378 options.adapter_file,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
379 options.cutnumber1,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
380 options.cutnumber2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
381 options.no_split,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
382 str_no_mismatches,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
383 aligner_command,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
384 db_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
385 tmp_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
386 outfile,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
387 XS_pct,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
388 XS_count,
1
weilong-guo
parents: 0
diff changeset
389 options.adapter_mismatch,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
390 options.Output_multiple_hit
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
391 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
392
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
393 outfile.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
394