Mercurial > repos > petr-novak > long_reads_sampling
comparison long_reads_sampling.py @ 0:dd46956ff61f draft
Uploaded
author | petr-novak |
---|---|
date | Fri, 08 Dec 2017 09:57:17 -0500 |
parents | |
children | ccaedca97e5e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dd46956ff61f |
---|---|
1 #!/usr/bin/env python3 | |
2 import sys | |
3 import argparse | |
4 import random | |
5 from argparse import ArgumentDefaultsHelpFormatter | |
6 from Bio import SeqIO | |
7 from long2short import get_sequences_summary | |
8 | |
9 | |
10 def get_args(): | |
11 '''Parses command line arguments ''' | |
12 description = ("Create sample of long reads, instead" | |
13 " of setting number of reads to be sampled," | |
14 "total length of all sampled sequences is defined") | |
15 | |
16 parser = argparse.ArgumentParser( | |
17 description=description, | |
18 formatter_class=ArgumentDefaultsHelpFormatter) | |
19 parser.add_argument('-i', | |
20 '--input', | |
21 type=argparse.FileType('r'), | |
22 help="file with long reads in fasta format") | |
23 parser.add_argument('-o', | |
24 '--output', | |
25 type=argparse.FileType('w'), | |
26 help="Output file name") | |
27 parser.add_argument("-l", | |
28 "--total_length", | |
29 type=int, | |
30 help="total length of sampled output") | |
31 | |
32 parser.add_argument("-s", | |
33 "--seed", | |
34 type=int, | |
35 default=123, | |
36 help="random number generator seed") | |
37 | |
38 args = parser.parse_args() | |
39 if len(sys.argv) == 1: | |
40 parser.print_help() | |
41 sys.exit(1) | |
42 | |
43 return args | |
44 | |
45 | |
46 def make_sequence_sample(args, seq_summary): | |
47 ''' | |
48 create output sequence of required length or larger, save to fasta | |
49 ''' | |
50 random.seed(args.seed) | |
51 print(args.seed) | |
52 selected_seq_id = set() | |
53 cummulative_length = 0 | |
54 for seq_id in random.sample(seq_summary.id_length.keys(), | |
55 seq_summary.number_of_sequence): | |
56 selected_seq_id.add(seq_id) | |
57 cummulative_length += seq_summary.id_length[seq_id] | |
58 if cummulative_length >= args.total_length: | |
59 break | |
60 print(selected_seq_id) | |
61 # to make it efficient same orger is necesary | |
62 selected_seq_id_ordered = [i | |
63 for i in seq_summary.id_length | |
64 if i in selected_seq_id] | |
65 curent_id = selected_seq_id_ordered.pop(0) | |
66 with open(args.output.name, 'w') as output_seq_file: | |
67 for seqs in SeqIO.parse(args.input.name, "fasta"): | |
68 if seqs.id == curent_id: | |
69 SeqIO.write(seqs, output_seq_file, "fasta") | |
70 try: | |
71 curent_id = selected_seq_id_ordered.pop(0) | |
72 except IndexError: | |
73 break | |
74 | |
75 | |
76 def main(): | |
77 ''' | |
78 Create sample of longn reads, sample size is defined as total length | |
79 ''' | |
80 args = get_args() | |
81 seq_summary = get_sequences_summary(args.input) | |
82 # check that total length if larger than required sampling | |
83 if seq_summary.total_length < args.total_length: | |
84 print("Input sequences total length: ", | |
85 seq_summary.total_length, | |
86 file=sys.stderr) | |
87 print("Required output total length: ", | |
88 args.total_length, | |
89 file=sys.stderr) | |
90 print("Required sequence length is larger than input data, exiting", | |
91 file=sys.stderr) | |
92 sys.exit(1) | |
93 | |
94 make_sequence_sample(args, seq_summary) | |
95 | |
96 | |
97 if __name__ == "__main__": | |
98 main() |