Mercurial > repos > petr-novak > long_reads_sampling
annotate long_reads_sampling.py @ 4:060fc04af21c draft default tip
planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit dcff0045d0b212ebc5c4bef6a824bdbd3bb16ac3
| author | petr-novak |
|---|---|
| date | Fri, 08 Dec 2023 13:11:02 +0000 |
| parents | ccaedca97e5e |
| children |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env python3 |
| 2 import sys | |
| 3 import argparse | |
| 4 import random | |
| 5 from argparse import ArgumentDefaultsHelpFormatter | |
| 6 from Bio import SeqIO | |
| 7 from long2short import get_sequences_summary | |
| 8 | |
| 9 | |
| 10 def get_args(): | |
| 11 '''Parses command line arguments ''' | |
| 12 description = ("Create sample of long reads, instead" | |
| 13 " of setting number of reads to be sampled," | |
| 14 "total length of all sampled sequences is defined") | |
| 15 | |
| 16 parser = argparse.ArgumentParser( | |
| 17 description=description, | |
| 18 formatter_class=ArgumentDefaultsHelpFormatter) | |
| 19 parser.add_argument('-i', | |
| 20 '--input', | |
| 21 type=argparse.FileType('r'), | |
| 22 help="file with long reads in fasta format") | |
| 23 parser.add_argument('-o', | |
| 24 '--output', | |
| 25 type=argparse.FileType('w'), | |
| 26 help="Output file name") | |
| 27 parser.add_argument("-l", | |
| 28 "--total_length", | |
| 29 type=int, | |
| 30 help="total length of sampled output") | |
| 31 | |
| 32 parser.add_argument("-s", | |
| 33 "--seed", | |
| 34 type=int, | |
| 35 default=123, | |
| 36 help="random number generator seed") | |
| 37 | |
| 38 args = parser.parse_args() | |
| 39 if len(sys.argv) == 1: | |
| 40 parser.print_help() | |
| 41 sys.exit(1) | |
| 42 | |
| 43 return args | |
| 44 | |
| 45 | |
| 46 def make_sequence_sample(args, seq_summary): | |
| 47 ''' | |
| 48 create output sequence of required length or larger, save to fasta | |
| 49 ''' | |
| 50 random.seed(args.seed) | |
| 51 print(args.seed) | |
| 52 selected_seq_id = set() | |
| 53 cummulative_length = 0 | |
| 54 for seq_id in random.sample(seq_summary.id_length.keys(), | |
| 55 seq_summary.number_of_sequence): | |
| 56 selected_seq_id.add(seq_id) | |
| 57 cummulative_length += seq_summary.id_length[seq_id] | |
| 58 if cummulative_length >= args.total_length: | |
| 59 break | |
| 60 # to make it efficient same orger is necesary | |
| 61 selected_seq_id_ordered = [i | |
| 62 for i in seq_summary.id_length | |
| 63 if i in selected_seq_id] | |
| 64 curent_id = selected_seq_id_ordered.pop(0) | |
| 65 with open(args.output.name, 'w') as output_seq_file: | |
| 66 for seqs in SeqIO.parse(args.input.name, "fasta"): | |
| 67 if seqs.id == curent_id: | |
| 68 SeqIO.write(seqs, output_seq_file, "fasta") | |
| 69 try: | |
| 70 curent_id = selected_seq_id_ordered.pop(0) | |
| 71 except IndexError: | |
| 72 break | |
| 73 | |
| 74 | |
| 75 def main(): | |
| 76 ''' | |
| 77 Create sample of longn reads, sample size is defined as total length | |
| 78 ''' | |
| 79 args = get_args() | |
| 80 seq_summary = get_sequences_summary(args.input) | |
| 81 # check that total length if larger than required sampling | |
| 82 if seq_summary.total_length < args.total_length: | |
| 83 print("Input sequences total length: ", | |
| 84 seq_summary.total_length, | |
| 85 file=sys.stderr) | |
| 86 print("Required output total length: ", | |
| 87 args.total_length, | |
| 88 file=sys.stderr) | |
| 89 print("Required sequence length is larger than input data, exiting", | |
| 90 file=sys.stderr) | |
| 91 sys.exit(1) | |
| 92 | |
|
2
ccaedca97e5e
planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
petr-novak
parents:
0
diff
changeset
|
93 print("input sequence info") |
|
ccaedca97e5e
planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
petr-novak
parents:
0
diff
changeset
|
94 print("total length: ", seq_summary.total_length) |
|
ccaedca97e5e
planemo upload for repository https://github.com/kavonrtep/galaxy_tools commit 2b3bc2334397749509cdf6fc432d891a09763c4f-dirty
petr-novak
parents:
0
diff
changeset
|
95 print("number of sequences: ", seq_summary.number_of_sequence) |
| 0 | 96 make_sequence_sample(args, seq_summary) |
| 97 | |
| 98 | |
| 99 if __name__ == "__main__": | |
| 100 main() |
