comparison mismatch_frequencies.py @ 2:2974c382105c draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
author mvdbeek
date Sat, 22 Dec 2018 04:15:47 -0500
parents 77de5fc623f9
children
comparison
equal deleted inserted replaced
1:3613460e891e 2:2974c382105c
1 import pysam, re, string 1 import re
2 import matplotlib.pyplot as plt 2 import string
3 import pysam
4 import matplotlib
3 import pandas as pd 5 import pandas as pd
4 import json
5 from collections import defaultdict 6 from collections import defaultdict
6 from collections import OrderedDict 7 from collections import OrderedDict
7 import argparse 8 import argparse
8 import itertools 9 import itertools
9 10
11 matplotlib.use('pdf')
12 import matplotlib.pyplot as plt # noqa: E402
13
14
10 class MismatchFrequencies: 15 class MismatchFrequencies:
11 '''Iterate over a SAM/BAM alignment file, collecting reads with mismatches. One 16 '''Iterate over a SAM/BAM alignment file, collecting reads with mismatches. One
12 class instance per alignment file. The result_dict attribute will contain a 17 class instance per alignment file. The result_dict attribute will contain a
13 nested dictionary with name, readlength and mismatch count.''' 18 nested dictionary with name, readlength and mismatch count.'''
14 def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21, 19 def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21,
15 maximal_readlength=21, 20 maximal_readlength=21,
16 number_of_allowed_mismatches=1, 21 number_of_allowed_mismatches=1,
17 ignore_5p_nucleotides=0, 22 ignore_5p_nucleotides=0,
18 ignore_3p_nucleotides=0, 23 ignore_3p_nucleotides=0,
19 possible_mismatches = [ 24 possible_mismatches=[
20 'AC', 'AG', 'AT', 25 'AC', 'AG', 'AT',
21 'CA', 'CG', 'CT', 26 'CA', 'CG', 'CT',
22 'GA', 'GC', 'GT', 27 'GA', 'GC', 'GT',
23 'TA', 'TC', 'TG' 28 'TA', 'TC', 'TG'
24 ]): 29 ]):
25 30
26 self.result_dict = result_dict 31 self.result_dict = result_dict
27 self.name = name 32 self.name = name
28 self.minimal_readlength = minimal_readlength 33 self.minimal_readlength = minimal_readlength
29 self.maximal_readlength = maximal_readlength 34 self.maximal_readlength = maximal_readlength
30 self.number_of_allowed_mismatches = number_of_allowed_mismatches 35 self.number_of_allowed_mismatches = number_of_allowed_mismatches
31 self.ignore_5p_nucleotides = ignore_5p_nucleotides 36 self.ignore_5p_nucleotides = ignore_5p_nucleotides
32 self.ignore_3p_nucleotides = ignore_3p_nucleotides 37 self.ignore_3p_nucleotides = ignore_3p_nucleotides
33 self.possible_mismatches = possible_mismatches 38 self.possible_mismatches = possible_mismatches
34 39
35 if alignment_file: 40 if alignment_file:
36 self.pysam_alignment = pysam.Samfile(alignment_file) 41 self.pysam_alignment = pysam.Samfile(alignment_file)
37 self.references = self.pysam_alignment.references #names of fasta reference sequences 42 self.references = self.pysam_alignment.references # names of fasta reference sequences
38 result_dict[name]=self.get_mismatches( 43 result_dict[name] = self.get_mismatches(
39 self.pysam_alignment, 44 self.pysam_alignment,
40 minimal_readlength, 45 minimal_readlength,
41 maximal_readlength, 46 maximal_readlength,
42 possible_mismatches 47 possible_mismatches
43 ) 48 )
44 49
45 def get_mismatches(self, pysam_alignment, minimal_readlength, 50 def get_mismatches(self, pysam_alignment, minimal_readlength,
46 maximal_readlength, possible_mismatches): 51 maximal_readlength, possible_mismatches):
47 mismatch_dict = defaultdict(int)
48 rec_dd = lambda: defaultdict(rec_dd) 52 rec_dd = lambda: defaultdict(rec_dd)
49 len_dict = rec_dd() 53 len_dict = rec_dd()
50 for alignedread in pysam_alignment: 54 for alignedread in pysam_alignment:
51 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength): 55 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength):
52 chromosome = pysam_alignment.getrname(alignedread.rname) 56 chromosome = pysam_alignment.getrname(alignedread.rname)
54 len_dict[int(alignedread.rlen)][chromosome]['total valid reads'] += 1 58 len_dict[int(alignedread.rlen)][chromosome]['total valid reads'] += 1
55 except TypeError: 59 except TypeError:
56 len_dict[int(alignedread.rlen)][chromosome]['total valid reads'] = 1 60 len_dict[int(alignedread.rlen)][chromosome]['total valid reads'] = 1
57 MD = alignedread.opt('MD') 61 MD = alignedread.opt('MD')
58 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches): 62 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches):
59 (ref_base, mismatch_base)=self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse) 63 (ref_base, mismatch_base) = self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse)
60 if ref_base == None: 64 if not ref_base:
61 continue 65 continue
62 else: 66 else:
63 for i, base in enumerate(ref_base): 67 for i, base in enumerate(ref_base):
64 if not ref_base[i]+mismatch_base[i] in possible_mismatches: 68 if not ref_base[i]+mismatch_base[i] in possible_mismatches:
65 continue 69 continue
66 try: 70 try:
67 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] += 1 71 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] += 1
68 except TypeError: 72 except TypeError:
69 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] = 1 73 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] = 1
70 return len_dict 74 return len_dict
71 75
72 def read_is_valid(self, read, min_readlength, max_readlength): 76 def read_is_valid(self, read, min_readlength, max_readlength):
73 '''Filter out reads that are unmatched, too short or 77 '''Filter out reads that are unmatched, too short or
74 too long or that contian insertions''' 78 too long or that contian insertions'''
75 if read.is_unmapped: 79 if read.is_unmapped:
76 return False 80 return False
78 return False 82 return False
79 if read.rlen > max_readlength: 83 if read.rlen > max_readlength:
80 return False 84 return False
81 else: 85 else:
82 return True 86 return True
83 87
84 def read_has_mismatch(self, read, number_of_allowed_mismatches=1): 88 def read_has_mismatch(self, read, number_of_allowed_mismatches=1):
85 '''keep only reads with one mismatch. Could be simplified''' 89 '''keep only reads with one mismatch. Could be simplified'''
86 NM=read.opt('NM') 90 NM = read.opt('NM')
87 if NM <1: #filter out reads with no mismatch 91 if NM < 1: # filter out reads with no mismatch
88 return False 92 return False
89 if NM >number_of_allowed_mismatches: #filter out reads with more than 1 mismtach 93 if NM > number_of_allowed_mismatches: # filter out reads with more than 1 mismtach
90 return False 94 return False
91 else: 95 else:
92 return True 96 return True
93 97
94 def mismatch_in_allowed_region(self, readseq, mismatch_position): 98 def mismatch_in_allowed_region(self, readseq, mismatch_position):
95 ''' 99 '''
96 >>> M = MismatchFrequencies() 100 >>> M = MismatchFrequencies()
97 >>> readseq = 'AAAAAA' 101 >>> readseq = 'AAAAAA'
98 >>> mismatch_position = 2 102 >>> mismatch_position = 2
106 >>> readseq = 'AAAAAA' 110 >>> readseq = 'AAAAAA'
107 >>> mismatch_position = 4 111 >>> mismatch_position = 4
108 >>> M.mismatch_in_allowed_region(readseq, mismatch_position) 112 >>> M.mismatch_in_allowed_region(readseq, mismatch_position)
109 False 113 False
110 ''' 114 '''
111 mismatch_position+=1 # To compensate for starting the count at 0 115 mismatch_position += 1 # To compensate for starting the count at 0
112 five_p = self.ignore_5p_nucleotides 116 five_p = self.ignore_5p_nucleotides
113 three_p = self.ignore_3p_nucleotides 117 three_p = self.ignore_3p_nucleotides
114 if any([five_p > 0, three_p > 0]): 118 if any([five_p > 0, three_p > 0]):
115 if any([mismatch_position <= five_p, 119 if any([mismatch_position <= five_p,
116 mismatch_position >= (len(readseq)+1-three_p)]): #Again compensate for starting the count at 0 120 mismatch_position >= (len(readseq) + 1 - three_p)]): # Again compensate for starting the count at 0
117 return False 121 return False
118 else: 122 else:
119 return True 123 return True
120 else: 124 else:
121 return True 125 return True
122 126
123 def read_to_reference_mismatch(self, MD, readseq, is_reverse): 127 def read_to_reference_mismatch(self, MD, readseq, is_reverse):
124 ''' 128 '''
125 This is where the magic happens. The MD tag contains SNP and indel information, 129 This is where the magic happens. The MD tag contains SNP and indel information,
126 without looking to the genome sequence. This is a typical MD tag: 3C0G2A6. 130 without looking to the genome sequence. This is a typical MD tag: 3C0G2A6.
127 3 bases of the read align to the reference, followed by a mismatch, where the 131 3 bases of the read align to the reference, followed by a mismatch, where the
128 reference base is C, followed by 10 bases aligned to the reference. 132 reference base is C, followed by 10 bases aligned to the reference.
129 suppose a reference 'CTTCGATAATCCTT' 133 suppose a reference 'CTTCGATAATCCTT'
130 ||| || |||||| 134 ||| || ||||||
131 and a read 'CTTATATTATCCTT'. 135 and a read 'CTTATATTATCCTT'.
132 This situation is represented by the above MD tag. 136 This situation is represented by the above MD tag.
133 Given MD tag and read sequence this function returns the reference base C, G and A, 137 Given MD tag and read sequence this function returns the reference base C, G and A,
134 and the mismatched base A, T, T. 138 and the mismatched base A, T, T.
135 >>> M = MismatchFrequencies() 139 >>> M = MismatchFrequencies()
136 >>> MD='3C0G2A7' 140 >>> MD='3C0G2A7'
137 >>> seq='CTTATATTATCCTT' 141 >>> seq='CTTATATTATCCTT'
138 >>> result=M.read_to_reference_mismatch(MD, seq, is_reverse=False) 142 >>> result=M.read_to_reference_mismatch(MD, seq, is_reverse=False)
139 >>> result[0]=="CGA" 143 >>> result[0]=="CGA"
140 True 144 True
141 >>> result[1]=="ATT" 145 >>> result[1]=="ATT"
142 True 146 True
143 >>> 147 >>>
144 ''' 148 '''
145 search=re.finditer('[ATGC]',MD) 149 search = re.finditer('[ATGC]', MD)
146 if '^' in MD: 150 if '^' in MD:
147 print 'WARNING insertion detected, mismatch calling skipped for this read!!!' 151 print 'WARNING insertion detected, mismatch calling skipped for this read!!!'
148 return (None, None) 152 return (None, None)
149 start_index=0 # refers to the leading integer of the MD string before an edited base 153 start_index = 0 # refers to the leading integer of the MD string before an edited base
150 current_position=0 # position of the mismatched nucleotide in the MD tag string 154 current_position = 0 # position of the mismatched nucleotide in the MD tag string
151 mismatch_position=0 # position of edited base in current read 155 mismatch_position = 0 # position of edited base in current read
152 reference_base="" 156 reference_base = ""
153 mismatched_base="" 157 mismatched_base = ""
154 for result in search: 158 for result in search:
155 current_position=result.start() 159 current_position = result.start()
156 mismatch_position=mismatch_position+1+int(MD[start_index:current_position]) #converts the leading characters before an edited base into integers 160 mismatch_position = mismatch_position + 1 + int(MD[start_index:current_position]) # converts the leading characters before an edited base into integers
157 start_index=result.end() 161 start_index = result.end()
158 reference_base+=MD[result.end()-1] 162 reference_base += MD[result.end() - 1]
159 mismatched_base+=readseq[mismatch_position-1] 163 mismatched_base += readseq[mismatch_position - 1]
160 if is_reverse: 164 if is_reverse:
161 reference_base=reverseComplement(reference_base) 165 reference_base = reverseComplement(reference_base)
162 mismatched_base=reverseComplement(mismatched_base) 166 mismatched_base = reverseComplement(mismatched_base)
163 mismatch_position=len(readseq)-mismatch_position-1 167 mismatch_position = len(readseq)-mismatch_position-1
164 if mismatched_base=='N': 168 if mismatched_base == 'N':
165 return (None, None) 169 return (None, None)
166 if self.mismatch_in_allowed_region(readseq, mismatch_position): 170 if self.mismatch_in_allowed_region(readseq, mismatch_position):
167 return (reference_base, mismatched_base) 171 return (reference_base, mismatched_base)
168 else: 172 else:
169 return (None, None) 173 return (None, None)
170 174
175
171 def reverseComplement(sequence): 176 def reverseComplement(sequence):
172 '''do a reverse complement of DNA base. 177 '''do a reverse complement of DNA base.
173 >>> reverseComplement('ATGC')=='GCAT' 178 >>> reverseComplement('ATGC')=='GCAT'
174 True 179 True
175 >>> 180 >>>
176 ''' 181 '''
177 sequence=sequence.upper() 182 sequence = sequence.upper()
178 complement = string.maketrans('ATCGN', 'TAGCN') 183 complement = string.maketrans('ATCGN', 'TAGCN')
179 return sequence.upper().translate(complement)[::-1] 184 return sequence.upper().translate(complement)[::-1]
180 185
186
181 def barplot(df, library, axes): 187 def barplot(df, library, axes):
182 df.plot(kind='bar', ax=axes, subplots=False,\ 188 df.plot(kind='bar', ax=axes, subplots=False,
183 stacked=False, legend='test',\ 189 stacked=False, legend='test',
184 title='Mismatch frequencies for {0}'.format(library)) 190 title='Mismatch frequencies for {0}'.format(library))
185 191
192
186 def df_to_tab(df, output): 193 def df_to_tab(df, output):
187 df.to_csv(output, sep='\t') 194 df.to_csv(output, sep='\t')
195
188 196
189 def reduce_result(df, possible_mismatches): 197 def reduce_result(df, possible_mismatches):
190 '''takes a pandas dataframe with full mismatch details and 198 '''takes a pandas dataframe with full mismatch details and
191 summarises the results for plotting.''' 199 summarises the results for plotting.'''
192 alignments = df['Alignment_file'].unique() 200 alignments = df['Alignment_file'].unique()
193 readlengths = df['Readlength'].unique() 201 readlengths = df['Readlength'].unique()
194 combinations = itertools.product(*[alignments, readlengths]) #generate all possible combinations of readlength and alignment files 202 combinations = itertools.product(*[alignments, readlengths]) # generate all possible combinations of readlength and alignment files
195 reduced_dict = {} 203 reduced_dict = {}
196 frames = [] 204 last_column = 3 + len(possible_mismatches)
197 last_column = 3+len(possible_mismatches)
198 for combination in combinations: 205 for combination in combinations:
199 library_subset = df[df['Alignment_file'] == combination[0]] 206 library_subset = df[df['Alignment_file'] == combination[0]]
200 library_readlength_subset = library_subset[library_subset['Readlength'] == combination[1]] 207 library_readlength_subset = library_subset[library_subset['Readlength'] == combination[1]]
201 sum_of_library_and_readlength = library_readlength_subset.iloc[:,3:last_column+1].sum() 208 sum_of_library_and_readlength = library_readlength_subset.iloc[:, 3:last_column+1].sum()
202 if not reduced_dict.has_key(combination[0]): 209 if combination[0] not in reduced_dict:
203 reduced_dict[combination[0]] = {} 210 reduced_dict[combination[0]] = {}
204 reduced_dict[combination[0]][combination[1]] = sum_of_library_and_readlength.to_dict() 211 reduced_dict[combination[0]][combination[1]] = sum_of_library_and_readlength.to_dict()
205 return reduced_dict 212 return reduced_dict
206 213
214
207 def plot_result(reduced_dict, args): 215 def plot_result(reduced_dict, args):
208 names=reduced_dict.keys() 216 names = reduced_dict.keys()
209 nrows=len(names)/2+1 217 nrows = len(names) / 2 + 1
210 fig = plt.figure(figsize=(16,32)) 218 fig = plt.figure(figsize=(16, 32))
211 for i,library in enumerate (names): 219 for i, library in enumerate(names):
212 axes=fig.add_subplot(nrows,2,i+1) 220 axes = fig.add_subplot(nrows, 2, i+1)
213 library_dict=reduced_dict[library] 221 library_dict = reduced_dict[library]
214 df=pd.DataFrame(library_dict) 222 df = pd.DataFrame(library_dict)
215 df.drop(['total aligned reads'], inplace=True) 223 df.drop(['total aligned reads'], inplace=True)
216 barplot(df, library, axes), 224 barplot(df, library, axes),
217 axes.set_ylabel('Mismatch count / all valid reads * readlength') 225 axes.set_ylabel('Mismatch count / all valid reads * readlength')
218 fig.savefig(args.output_pdf, format='pdf') 226 fig.savefig(args.output_pdf, format='pdf')
227
219 228
220 def format_result_dict(result_dict, chromosomes, possible_mismatches): 229 def format_result_dict(result_dict, chromosomes, possible_mismatches):
221 '''Turn nested dictionary into preformatted tab seperated lines''' 230 '''Turn nested dictionary into preformatted tab seperated lines'''
222 header = "Reference sequence\tAlignment_file\tReadlength\t" + "\t".join( 231 header = "Reference sequence\tAlignment_file\tReadlength\t" + "\t".join(
223 possible_mismatches) + "\ttotal aligned reads" 232 possible_mismatches) + "\ttotal aligned reads"
235 except KeyError: 244 except KeyError:
236 line.extend([0 for mismatch in possible_mismatches]) 245 line.extend([0 for mismatch in possible_mismatches])
237 line.extend([0]) 246 line.extend([0])
238 result.append(line) 247 result.append(line)
239 df = pd.DataFrame(result, columns=header.split('\t')) 248 df = pd.DataFrame(result, columns=header.split('\t'))
240 last_column=3+len(possible_mismatches) 249 last_column = 3 + len(possible_mismatches)
241 df['mismatches/per aligned nucleotides'] = df.iloc[:,3:last_column].sum(1)/(df.iloc[:,last_column]*df['Readlength']) 250 df['mismatches/per aligned nucleotides'] = df.iloc[:, 3:last_column].sum(1)/(df.iloc[:, last_column] * df['Readlength'])
242 return df 251 return df
243 252
253
244 def setup_MismatchFrequencies(args): 254 def setup_MismatchFrequencies(args):
245 resultDict=OrderedDict() 255 resultDict = OrderedDict()
246 kw_list=[{'result_dict' : resultDict, 256 kw_list = [{'result_dict': resultDict,
247 'alignment_file' :alignment_file, 257 'alignment_file': alignment_file,
248 'name' : name, 258 'name': name,
249 'minimal_readlength' : args.min, 259 'minimal_readlength': args.min,
250 'maximal_readlength' : args.max, 260 'maximal_readlength': args.max,
251 'number_of_allowed_mismatches' : args.n_mm, 261 'number_of_allowed_mismatches': args.n_mm,
252 'ignore_5p_nucleotides' : args.five_p, 262 'ignore_5p_nucleotides': args.five_p,
253 'ignore_3p_nucleotides' : args.three_p, 263 'ignore_3p_nucleotides': args.three_p,
254 'possible_mismatches' : args.possible_mismatches } 264 'possible_mismatches': args.possible_mismatches}
255 for alignment_file, name in zip(args.input, args.name)] 265 for alignment_file, name in zip(args.input, args.name)]
256 return (kw_list, resultDict) 266 return (kw_list, resultDict)
267
257 268
258 def nested_dict_to_df(dictionary): 269 def nested_dict_to_df(dictionary):
259 dictionary = {(outerKey, innerKey): values for outerKey, innerDict in dictionary.iteritems() for innerKey, values in innerDict.iteritems()} 270 dictionary = {(outerKey, innerKey): values for outerKey, innerDict in dictionary.iteritems() for innerKey, values in innerDict.iteritems()}
260 df=pd.DataFrame.from_dict(dictionary).transpose() 271 df = pd.DataFrame.from_dict(dictionary).transpose()
261 df.index.names = ['Library', 'Readlength'] 272 df.index.names = ['Library', 'Readlength']
262 return df 273 return df
263 274
275
264 def run_MismatchFrequencies(args): 276 def run_MismatchFrequencies(args):
265 kw_list, resultDict=setup_MismatchFrequencies(args) 277 kw_list, resultDict = setup_MismatchFrequencies(args)
266 references = [MismatchFrequencies(**kw_dict).references for kw_dict in kw_list] 278 references = [MismatchFrequencies(**kw_dict).references for kw_dict in kw_list]
267 return (resultDict, references[0]) 279 return (resultDict, references[0])
280
268 281
269 def main(): 282 def main():
270 result_dict, references = run_MismatchFrequencies(args) 283 result_dict, references = run_MismatchFrequencies(args)
271 df = format_result_dict(result_dict, references, args.possible_mismatches) 284 df = format_result_dict(result_dict, references, args.possible_mismatches)
272 reduced_dict = reduce_result(df, args.possible_mismatches) 285 reduced_dict = reduce_result(df, args.possible_mismatches)
273 plot_result(reduced_dict, args) 286 plot_result(reduced_dict, args)
274 reduced_df = nested_dict_to_df(reduced_dict) 287 reduced_df = nested_dict_to_df(reduced_dict)
275 df_to_tab(reduced_df, args.output_tab) 288 df_to_tab(reduced_df, args.output_tab)
276 if not args.expanded_output_tab == None: 289 if args.expanded_output_tab:
277 df_to_tab(df, args.expanded_output_tab) 290 df_to_tab(df, args.expanded_output_tab)
278 return reduced_dict 291 return reduced_dict
279 292
280 if __name__ == "__main__": 293 if __name__ == "__main__":
281 294
282 parser = argparse.ArgumentParser(description='Produce mismatch statistics for BAM/SAM alignment files.') 295 parser = argparse.ArgumentParser(description='Produce mismatch statistics for BAM/SAM alignment files.')
283 parser.add_argument('--input', nargs='*', help='Input files in SAM/BAM format') 296 parser.add_argument('--input', nargs='*', help='Input files in SAM/BAM format')
284 parser.add_argument('--name', nargs='*', help='Name for input file to display in output file. Should have same length as the number of inputs') 297 parser.add_argument('--name', nargs='*', help='Name for input file to display in output file. Should have same length as the number of inputs')
285 parser.add_argument('--output_pdf', help='Output filename for graph') 298 parser.add_argument('--output_pdf', help='Output filename for graph')
286 parser.add_argument('--output_tab', help='Output filename for table') 299 parser.add_argument('--output_tab', help='Output filename for table')
287 parser.add_argument('--expanded_output_tab', default=None, help='Output filename for table') 300 parser.add_argument('--expanded_output_tab', default=None, help='Output filename for table')
288 parser.add_argument('--possible_mismatches', default=[ 301 parser.add_argument('--possible_mismatches', default=[
289 'AC', 'AG', 'AT','CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG' 302 'AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG'
290 ], nargs='+', help='specify mismatches that should be counted for the mismatch frequency. The format is Reference base -> observed base, eg AG for A to G mismatches.') 303 ], nargs='+', help='specify mismatches that should be counted for the mismatch frequency. The format is Reference base -> observed base, eg AG for A to G mismatches.')
291 parser.add_argument('--min', '--minimal_readlength', type=int, help='minimum readlength') 304 parser.add_argument('--min', '--minimal_readlength', type=int, help='minimum readlength')
292 parser.add_argument('--max', '--maximal_readlength', type=int, help='maximum readlength') 305 parser.add_argument('--max', '--maximal_readlength', type=int, help='maximum readlength')
293 parser.add_argument('--n_mm', '--number_allowed_mismatches', type=int, default=1, help='discard reads with more than n mismatches') 306 parser.add_argument('--n_mm', '--number_allowed_mismatches', type=int, default=1, help='discard reads with more than n mismatches')
294 parser.add_argument('--five_p', '--ignore_5p_nucleotides', type=int, default=0, help='when calculating nucleotide mismatch frequencies ignore the first N nucleotides of the read') 307 parser.add_argument('--five_p', '--ignore_5p_nucleotides', type=int, default=0, help='when calculating nucleotide mismatch frequencies ignore the first N nucleotides of the read')
295 parser.add_argument('--three_p', '--ignore_3p_nucleotides', type=int, default=1, help='when calculating nucleotide mismatch frequencies ignore the last N nucleotides of the read') 308 parser.add_argument('--three_p', '--ignore_3p_nucleotides', type=int, default=1, help='when calculating nucleotide mismatch frequencies ignore the last N nucleotides of the read')
296 #args = parser.parse_args(['--input', '3mismatches_ago2ip_s2.bam', '3mismatches_ago2ip_ovary.bam','--possible_mismatches','AC','AG', 'CG', 'TG', 'CT','--name', 'Siomi1', 'Siomi2' , '--five_p', '3','--three_p','3','--output_pdf', 'out.pdf', '--output_tab', 'out.tab', '--expanded_output_tab', 'expanded.tab', '--min', '20', '--max', '22']) 309 # args = parser.parse_args(['--input', '3mismatches_ago2ip_s2.bam', '3mismatches_ago2ip_ovary.bam','--possible_mismatches','AC','AG', 'CG', 'TG', 'CT','--name', 'Siomi1', 'Siomi2' , '--five_p', '3','--three_p','3','--output_pdf', 'out.pdf', '--output_tab', 'out.tab', '--expanded_output_tab', 'expanded.tab', '--min', '20', '--max', '22'])
297 args = parser.parse_args() 310 args = parser.parse_args()
298 reduced_dict = main() 311 reduced_dict = main()
299
300