0
|
1 #!/usr/bin/env python3
|
|
2 import argparse
|
|
3 import sys
|
|
4 from argparse import ArgumentDefaultsHelpFormatter
|
|
5 from collections import namedtuple
|
|
6 from collections import OrderedDict
|
|
7 from Bio import SeqIO
|
|
8
|
|
9 SEQ_FORMAT = "fasta"
|
|
10 # Default sampling, used on argparse:
|
|
11 DEFAULT_READ_LENGTH = 200
|
|
12 DEFAULT_INSERT_LENGTH = 700
|
|
13 DEFAULT_COVERAGE = 0.1
|
|
14
|
|
15 Sequences_summary = namedtuple('Fasta_summary',
|
|
16 ['total_length', 'number_of_sequence',
|
|
17 'id_length', 'file_path', 'format'])
|
|
18
|
|
19 Coordinates = namedtuple('Coordinates', "id start1 end1 start2 end2")
|
|
20
|
|
21
|
|
22 def get_sequences_summary(seq_file):
|
|
23 ''' return basic characteristic of sequences '''
|
|
24 id_length = OrderedDict()
|
|
25 totat_length = 0
|
|
26 N = 0
|
|
27 for seqs in SeqIO.parse(seq_file, SEQ_FORMAT):
|
|
28 id_length[seqs.id] = len(seqs)
|
|
29 totat_length += len(seqs)
|
|
30 N += 1
|
|
31 return Sequences_summary(totat_length, N, id_length, seq_file, SEQ_FORMAT)
|
|
32
|
|
33
|
|
34 def get_short_pseudoreads_position(fasta_summary, sampling_options):
|
|
35 """Return selected position on long read sequences
|
|
36 Arguments:
|
|
37 fasta_summary - namedtuple Fasta_summaty containing information about sequences
|
|
38 sampling options - namedtuple, specified how sequences should be sampled
|
|
39 Return value:
|
|
40 (sequence_id, start1, end1, start2, end2)
|
|
41 """
|
|
42 interval = int(2 * sampling_options.read_length /
|
|
43 sampling_options.coverage)
|
|
44 for seqname, length in fasta_summary.id_length.items():
|
|
45 start_positions = range(1, length, interval)
|
|
46 for s in start_positions:
|
|
47 yield Coordinates(seqname, s, s + sampling_options.read_length,
|
|
48 s + sampling_options.insert_length -
|
|
49 sampling_options.read_length,
|
|
50 s + sampling_options.insert_length)
|
|
51
|
|
52
|
|
53 def extract_short_reads(summary, args):
|
|
54 '''yield short reades sampled from long reads
|
|
55 Arguments:
|
|
56 summary.. named tuple specifie sequences properties, path, length, idslist
|
|
57 args ..... Define how short sequences should be generated
|
|
58 '''
|
|
59 pos = get_short_pseudoreads_position(summary, args)
|
|
60 coords = next(pos)
|
|
61 index = 0
|
|
62 for i in SeqIO.parse(summary.file_path, summary.format):
|
|
63 index += 1
|
|
64 while True:
|
|
65 if coords.id == i.id:
|
|
66 # forward read
|
|
67 subseq_f = i[coords.start1:coords.end1]
|
|
68 subseq_f.id = "{}_{}_{}_f".format(index, coords.start1,
|
|
69 coords.end1)
|
|
70 subseq_f.description = ""
|
|
71 # reverse complement read
|
|
72 subseq_r = i[coords.start2:coords.end2].reverse_complement()
|
|
73 subseq_r.id = "{}_{}_{}_r".format(index, coords.start1,
|
|
74 coords.end1)
|
|
75 subseq_r.description = ""
|
|
76 # return only if sequences are long enough
|
|
77 if len(subseq_r) == args.read_length:
|
|
78 yield subseq_f
|
|
79 yield subseq_r
|
|
80 coords = next(pos)
|
|
81 else:
|
|
82 break
|
|
83
|
|
84
|
|
85 def long2short(args):
|
|
86 '''Sample short reads from long sequences
|
|
87 args contain these attributes::
|
|
88 ------------
|
|
89 input_file - path to file in fasta format
|
|
90 output_file - path to output file, fasta format
|
|
91 options - options is named tuple and specifies read length
|
|
92 coverage, insert length, max number of sequences which will be return
|
|
93
|
|
94 '''
|
|
95 summary = get_sequences_summary(args.input.name)
|
|
96 with open(args.output.name, 'w') as f:
|
|
97 for i in extract_short_reads(summary, args):
|
|
98 SeqIO.write(i, f, SEQ_FORMAT)
|
|
99
|
|
100
|
|
101 def get_args():
|
|
102 '''Parses command line arguments '''
|
|
103 description = "Creates pseudo short reads from long oxford nanopore reads"
|
|
104 parser = argparse.ArgumentParser(
|
|
105 description=description,
|
|
106 formatter_class=ArgumentDefaultsHelpFormatter)
|
|
107 parser.add_argument('-i',
|
|
108 '--input',
|
|
109 type=argparse.FileType('r'),
|
|
110 help="file with long reads in fasta format")
|
|
111 parser.add_argument('-o',
|
|
112 '--output',
|
|
113 type=argparse.FileType('w'),
|
|
114 help="Output file name")
|
|
115 parser.add_argument("-cov",
|
|
116 "--coverage",
|
|
117 type=float,
|
|
118 default=DEFAULT_COVERAGE,
|
|
119 help="samplig coverage")
|
|
120 parser.add_argument(
|
|
121 "-L",
|
|
122 "--insert_length",
|
|
123 type=int,
|
|
124 default=DEFAULT_INSERT_LENGTH,
|
|
125 help="length of insert, must be longer than read length")
|
|
126 parser.add_argument("-l",
|
|
127 "--read_length",
|
|
128 type=int,
|
|
129 default=DEFAULT_READ_LENGTH,
|
|
130 help="read length")
|
|
131
|
|
132 args = parser.parse_args()
|
|
133 if len(sys.argv) == 1:
|
|
134 parser.print_help()
|
|
135 sys.exit(1)
|
|
136
|
|
137 #hassert args.insert_length > args.read_length, "read length must be shorter than insert length"
|
|
138 return args
|
|
139
|
|
140
|
|
141 def main():
|
|
142 '''Sample short reads from long sequences
|
|
143 Files path are passed as command line positional arguments
|
|
144 '''
|
|
145 args = get_args()
|
|
146 long2short(args)
|
|
147
|
|
148
|
|
149 if __name__ == "__main__":
|
|
150 main()
|