annotate long_reads_sampling.py @ 2:ccaedca97e5e draft

planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
author petr-novak
date Fri, 08 Dec 2023 09:17:46 +0000
parents dd46956ff61f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
2 import sys
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
3 import argparse
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
4 import random
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
5 from argparse import ArgumentDefaultsHelpFormatter
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
6 from Bio import SeqIO
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
7 from long2short import get_sequences_summary
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
8
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
9
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
10 def get_args():
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
11 '''Parses command line arguments '''
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
12 description = ("Create sample of long reads, instead"
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
13 " of setting number of reads to be sampled,"
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
14 "total length of all sampled sequences is defined")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
15
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
16 parser = argparse.ArgumentParser(
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
17 description=description,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
18 formatter_class=ArgumentDefaultsHelpFormatter)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
19 parser.add_argument('-i',
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
20 '--input',
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
21 type=argparse.FileType('r'),
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
22 help="file with long reads in fasta format")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
23 parser.add_argument('-o',
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
24 '--output',
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
25 type=argparse.FileType('w'),
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
26 help="Output file name")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
27 parser.add_argument("-l",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
28 "--total_length",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
29 type=int,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
30 help="total length of sampled output")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
31
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
32 parser.add_argument("-s",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
33 "--seed",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
34 type=int,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
35 default=123,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
36 help="random number generator seed")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
37
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
38 args = parser.parse_args()
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
39 if len(sys.argv) == 1:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
40 parser.print_help()
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
41 sys.exit(1)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
42
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
43 return args
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
44
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
45
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
46 def make_sequence_sample(args, seq_summary):
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
47 '''
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
48 create output sequence of required length or larger, save to fasta
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
49 '''
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
50 random.seed(args.seed)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
51 print(args.seed)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
52 selected_seq_id = set()
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
53 cummulative_length = 0
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
54 for seq_id in random.sample(seq_summary.id_length.keys(),
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
55 seq_summary.number_of_sequence):
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
56 selected_seq_id.add(seq_id)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
57 cummulative_length += seq_summary.id_length[seq_id]
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
58 if cummulative_length >= args.total_length:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
59 break
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
60 # to make it efficient same orger is necesary
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
61 selected_seq_id_ordered = [i
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
62 for i in seq_summary.id_length
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
63 if i in selected_seq_id]
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
64 curent_id = selected_seq_id_ordered.pop(0)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
65 with open(args.output.name, 'w') as output_seq_file:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
66 for seqs in SeqIO.parse(args.input.name, "fasta"):
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
67 if seqs.id == curent_id:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
68 SeqIO.write(seqs, output_seq_file, "fasta")
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
69 try:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
70 curent_id = selected_seq_id_ordered.pop(0)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
71 except IndexError:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
72 break
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
73
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
74
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
75 def main():
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
76 '''
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
77 Create sample of longn reads, sample size is defined as total length
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
78 '''
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
79 args = get_args()
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
80 seq_summary = get_sequences_summary(args.input)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
81 # check that total length if larger than required sampling
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
82 if seq_summary.total_length < args.total_length:
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
83 print("Input sequences total length: ",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
84 seq_summary.total_length,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
85 file=sys.stderr)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
86 print("Required output total length: ",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
87 args.total_length,
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
88 file=sys.stderr)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
89 print("Required sequence length is larger than input data, exiting",
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
90 file=sys.stderr)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
91 sys.exit(1)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
92
2
ccaedca97e5e planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
petr-novak
parents: 0
diff changeset
93 print("input sequence info")
ccaedca97e5e planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
petr-novak
parents: 0
diff changeset
94 print("total length: ", seq_summary.total_length)
ccaedca97e5e planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
petr-novak
parents: 0
diff changeset
95 print("number of sequences: ", seq_summary.number_of_sequence)
0
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
96 make_sequence_sample(args, seq_summary)
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
97
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
98
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
99 if __name__ == "__main__":
dd46956ff61f Uploaded
petr-novak
parents:
diff changeset
100 main()