annotate long_reads_sampling.py @ 0:dd46956ff61f draft

Uploaded
author petr-novak
date Fri, 08 Dec 2017 09:57:17 -0500
parents
children ccaedca97e5e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
2 import sys
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
3 import argparse
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
4 import random
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
5 from argparse import ArgumentDefaultsHelpFormatter
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
6 from Bio import SeqIO
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
7 from long2short import get_sequences_summary
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
8
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
9
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
10 def get_args():
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
11 '''Parses command line arguments '''
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
12 description = ("Create sample of long reads, instead"
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
13 " of setting number of reads to be sampled,"
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
14 "total length of all sampled sequences is defined")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
15
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
16 parser = argparse.ArgumentParser(
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
17 description=description,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
18 formatter_class=ArgumentDefaultsHelpFormatter)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
19 parser.add_argument('-i',
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
20 '--input',
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
21 type=argparse.FileType('r'),
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
22 help="file with long reads in fasta format")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
23 parser.add_argument('-o',
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
24 '--output',
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
25 type=argparse.FileType('w'),
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
26 help="Output file name")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
27 parser.add_argument("-l",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
28 "--total_length",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
29 type=int,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
30 help="total length of sampled output")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
31
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
32 parser.add_argument("-s",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
33 "--seed",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
34 type=int,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
35 default=123,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
36 help="random number generator seed")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
37
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
38 args = parser.parse_args()
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
39 if len(sys.argv) == 1:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
40 parser.print_help()
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
41 sys.exit(1)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
42
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
43 return args
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
44
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
45
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
46 def make_sequence_sample(args, seq_summary):
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
47 '''
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
48 create output sequence of required length or larger, save to fasta
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
49 '''
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
50 random.seed(args.seed)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
51 print(args.seed)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
52 selected_seq_id = set()
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
53 cummulative_length = 0
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
54 for seq_id in random.sample(seq_summary.id_length.keys(),
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
55 seq_summary.number_of_sequence):
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
56 selected_seq_id.add(seq_id)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
57 cummulative_length += seq_summary.id_length[seq_id]
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
58 if cummulative_length >= args.total_length:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
59 break
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
60 print(selected_seq_id)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
61 # to make it efficient same orger is necesary
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
62 selected_seq_id_ordered = [i
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
63 for i in seq_summary.id_length
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
64 if i in selected_seq_id]
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
65 curent_id = selected_seq_id_ordered.pop(0)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
66 with open(args.output.name, 'w') as output_seq_file:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
67 for seqs in SeqIO.parse(args.input.name, "fasta"):
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
68 if seqs.id == curent_id:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
69 SeqIO.write(seqs, output_seq_file, "fasta")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
70 try:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
71 curent_id = selected_seq_id_ordered.pop(0)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
72 except IndexError:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
73 break
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
74
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
75
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
76 def main():
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
77 '''
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
78 Create sample of longn reads, sample size is defined as total length
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
79 '''
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
80 args = get_args()
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
81 seq_summary = get_sequences_summary(args.input)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
82 # check that total length if larger than required sampling
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
83 if seq_summary.total_length < args.total_length:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
84 print("Input sequences total length: ",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
85 seq_summary.total_length,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
86 file=sys.stderr)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
87 print("Required output total length: ",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
88 args.total_length,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
89 file=sys.stderr)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
90 print("Required sequence length is larger than input data, exiting",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
91 file=sys.stderr)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
92 sys.exit(1)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
93
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
94 make_sequence_sample(args, seq_summary)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
95
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
96
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
97 if __name__ == "__main__":
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
98 main()