Mercurial > repos > petr-novak > long_reads_sampling
annotate long_reads_sampling.py @ 2:ccaedca97e5e draft
planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
author | petr-novak |
---|---|
date | Fri, 08 Dec 2023 09:17:46 +0000 |
parents | dd46956ff61f |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python3 |
2 import sys | |
3 import argparse | |
4 import random | |
5 from argparse import ArgumentDefaultsHelpFormatter | |
6 from Bio import SeqIO | |
7 from long2short import get_sequences_summary | |
8 | |
9 | |
10 def get_args(): | |
11 '''Parses command line arguments ''' | |
12 description = ("Create sample of long reads, instead" | |
13 " of setting number of reads to be sampled," | |
14 "total length of all sampled sequences is defined") | |
15 | |
16 parser = argparse.ArgumentParser( | |
17 description=description, | |
18 formatter_class=ArgumentDefaultsHelpFormatter) | |
19 parser.add_argument('-i', | |
20 '--input', | |
21 type=argparse.FileType('r'), | |
22 help="file with long reads in fasta format") | |
23 parser.add_argument('-o', | |
24 '--output', | |
25 type=argparse.FileType('w'), | |
26 help="Output file name") | |
27 parser.add_argument("-l", | |
28 "--total_length", | |
29 type=int, | |
30 help="total length of sampled output") | |
31 | |
32 parser.add_argument("-s", | |
33 "--seed", | |
34 type=int, | |
35 default=123, | |
36 help="random number generator seed") | |
37 | |
38 args = parser.parse_args() | |
39 if len(sys.argv) == 1: | |
40 parser.print_help() | |
41 sys.exit(1) | |
42 | |
43 return args | |
44 | |
45 | |
46 def make_sequence_sample(args, seq_summary): | |
47 ''' | |
48 create output sequence of required length or larger, save to fasta | |
49 ''' | |
50 random.seed(args.seed) | |
51 print(args.seed) | |
52 selected_seq_id = set() | |
53 cummulative_length = 0 | |
54 for seq_id in random.sample(seq_summary.id_length.keys(), | |
55 seq_summary.number_of_sequence): | |
56 selected_seq_id.add(seq_id) | |
57 cummulative_length += seq_summary.id_length[seq_id] | |
58 if cummulative_length >= args.total_length: | |
59 break | |
60 # to make it efficient same orger is necesary | |
61 selected_seq_id_ordered = [i | |
62 for i in seq_summary.id_length | |
63 if i in selected_seq_id] | |
64 curent_id = selected_seq_id_ordered.pop(0) | |
65 with open(args.output.name, 'w') as output_seq_file: | |
66 for seqs in SeqIO.parse(args.input.name, "fasta"): | |
67 if seqs.id == curent_id: | |
68 SeqIO.write(seqs, output_seq_file, "fasta") | |
69 try: | |
70 curent_id = selected_seq_id_ordered.pop(0) | |
71 except IndexError: | |
72 break | |
73 | |
74 | |
75 def main(): | |
76 ''' | |
77 Create sample of longn reads, sample size is defined as total length | |
78 ''' | |
79 args = get_args() | |
80 seq_summary = get_sequences_summary(args.input) | |
81 # check that total length if larger than required sampling | |
82 if seq_summary.total_length < args.total_length: | |
83 print("Input sequences total length: ", | |
84 seq_summary.total_length, | |
85 file=sys.stderr) | |
86 print("Required output total length: ", | |
87 args.total_length, | |
88 file=sys.stderr) | |
89 print("Required sequence length is larger than input data, exiting", | |
90 file=sys.stderr) | |
91 sys.exit(1) | |
92 | |
2
ccaedca97e5e
planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
petr-novak
parents:
0
diff
changeset
|
93 print("input sequence info") |
ccaedca97e5e
planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
petr-novak
parents:
0
diff
changeset
|
94 print("total length: ", seq_summary.total_length) |
ccaedca97e5e
planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
petr-novak
parents:
0
diff
changeset
|
95 print("number of sequences: ", seq_summary.number_of_sequence) |
0 | 96 make_sequence_sample(args, seq_summary) |
97 | |
98 | |
99 if __name__ == "__main__": | |
100 main() |