0
|
1 #!/usr/bin/env python3
|
|
2 import sys
|
|
3 import argparse
|
|
4 import random
|
|
5 from argparse import ArgumentDefaultsHelpFormatter
|
|
6 from Bio import SeqIO
|
|
7 from long2short import get_sequences_summary
|
|
8
|
|
9
|
|
10 def get_args():
|
|
11 '''Parses command line arguments '''
|
|
12 description = ("Create sample of long reads, instead"
|
|
13 " of setting number of reads to be sampled,"
|
|
14 "total length of all sampled sequences is defined")
|
|
15
|
|
16 parser = argparse.ArgumentParser(
|
|
17 description=description,
|
|
18 formatter_class=ArgumentDefaultsHelpFormatter)
|
|
19 parser.add_argument('-i',
|
|
20 '--input',
|
|
21 type=argparse.FileType('r'),
|
|
22 help="file with long reads in fasta format")
|
|
23 parser.add_argument('-o',
|
|
24 '--output',
|
|
25 type=argparse.FileType('w'),
|
|
26 help="Output file name")
|
|
27 parser.add_argument("-l",
|
|
28 "--total_length",
|
|
29 type=int,
|
|
30 help="total length of sampled output")
|
|
31
|
|
32 parser.add_argument("-s",
|
|
33 "--seed",
|
|
34 type=int,
|
|
35 default=123,
|
|
36 help="random number generator seed")
|
|
37
|
|
38 args = parser.parse_args()
|
|
39 if len(sys.argv) == 1:
|
|
40 parser.print_help()
|
|
41 sys.exit(1)
|
|
42
|
|
43 return args
|
|
44
|
|
45
|
|
46 def make_sequence_sample(args, seq_summary):
|
|
47 '''
|
|
48 create output sequence of required length or larger, save to fasta
|
|
49 '''
|
|
50 random.seed(args.seed)
|
|
51 print(args.seed)
|
|
52 selected_seq_id = set()
|
|
53 cummulative_length = 0
|
|
54 for seq_id in random.sample(seq_summary.id_length.keys(),
|
|
55 seq_summary.number_of_sequence):
|
|
56 selected_seq_id.add(seq_id)
|
|
57 cummulative_length += seq_summary.id_length[seq_id]
|
|
58 if cummulative_length >= args.total_length:
|
|
59 break
|
|
60 print(selected_seq_id)
|
|
61 # to make it efficient same orger is necesary
|
|
62 selected_seq_id_ordered = [i
|
|
63 for i in seq_summary.id_length
|
|
64 if i in selected_seq_id]
|
|
65 curent_id = selected_seq_id_ordered.pop(0)
|
|
66 with open(args.output.name, 'w') as output_seq_file:
|
|
67 for seqs in SeqIO.parse(args.input.name, "fasta"):
|
|
68 if seqs.id == curent_id:
|
|
69 SeqIO.write(seqs, output_seq_file, "fasta")
|
|
70 try:
|
|
71 curent_id = selected_seq_id_ordered.pop(0)
|
|
72 except IndexError:
|
|
73 break
|
|
74
|
|
75
|
|
76 def main():
|
|
77 '''
|
|
78 Create sample of longn reads, sample size is defined as total length
|
|
79 '''
|
|
80 args = get_args()
|
|
81 seq_summary = get_sequences_summary(args.input)
|
|
82 # check that total length if larger than required sampling
|
|
83 if seq_summary.total_length < args.total_length:
|
|
84 print("Input sequences total length: ",
|
|
85 seq_summary.total_length,
|
|
86 file=sys.stderr)
|
|
87 print("Required output total length: ",
|
|
88 args.total_length,
|
|
89 file=sys.stderr)
|
|
90 print("Required sequence length is larger than input data, exiting",
|
|
91 file=sys.stderr)
|
|
92 sys.exit(1)
|
|
93
|
|
94 make_sequence_sample(args, seq_summary)
|
|
95
|
|
96
|
|
97 if __name__ == "__main__":
|
|
98 main()
|