Mercurial > repos > rplanel > sequence_splitter
comparison sequence-splitter.py @ 1:7b509a1801e4 draft
"planemo upload for repository https://github.com/rplanel/galaxy-tools/tree/master/tools/sequence-splitter commit 85ffc2e3805940c0ebb21d5450b86524bd788d7b"
author | rplanel |
---|---|
date | Tue, 20 Aug 2019 09:46:49 -0400 |
parents | 3e33310a7082 |
children | 6dd4f53b9964 |
comparison
equal
deleted
inserted
replaced
0:3e33310a7082 | 1:7b509a1801e4 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python2 |
2 # coding: utf-8 | 2 # coding: utf-8 |
3 | 3 |
4 import argparse | 4 import argparse |
5 import logging | 5 import logging |
6 import os | 6 import os |
22 # extract the filename (with no extension) | 22 # extract the filename (with no extension) |
23 _, filename = os.path.split(basename) | 23 _, filename = os.path.split(basename) |
24 chunk_size = args.chunk_size | 24 chunk_size = args.chunk_size |
25 # split the sequences in chunks | 25 # split the sequences in chunks |
26 if chunk_size: | 26 if chunk_size: |
27 logger.info("%s = %s", "chunk size parameter", chunk_size) | 27 logger.info( |
28 "%s = %s", "Number of sequences per chunk parameter", chunk_size) | |
28 sequences_record = gen_sequence_record(args.sequences, args.format) | 29 sequences_record = gen_sequence_record(args.sequences, args.format) |
29 chunks = gen_get_chunks_by_size(sequences_record, chunk_size) | 30 chunks = gen_chunks_of_size(sequences_record, chunk_size) |
30 else: | 31 else: |
31 logger.info("%s = %s", "number of chunks parameter", args.nb_chunk) | 32 logger.info("%s = %s", "number of chunks parameter", args.nb_chunk) |
32 chunks = gen_get_chunks(args.sequences, args.format, args.nb_chunk) | 33 chunks = gen_chunks(args.sequences, args.format, args.nb_chunk) |
33 | 34 |
34 # Write the chunks in numbered files. | 35 # Write the chunks in numbered files. |
35 write_chunks(chunks, args.output, filename, file_extension, args.format) | 36 write_chunks(chunks, args.output, filename, file_extension, args.format) |
36 | 37 |
37 | 38 |
38 def gen_get_chunks(sequences_path, sequences_format, nb_chunk): | 39 def gen_chunks(sequences_path, sequences_format, nb_chunk): |
39 """[summary] | 40 """[summary] |
40 | 41 Split sequences to have a number of chunks defined by nb_chunks |
41 Arguments: | 42 Arguments: |
42 sequences_path {[type]} -- [description] | 43 sequences_path {[type]} -- Path to the sequence file |
43 sequences_format {[type]} -- [description] | 44 sequences_format {[type]} -- Format of the sequence |
44 nb_chunk {[type]} -- [description] | 45 nb_chunk {int} -- Number of chunks we want to obtain |
45 | 46 |
46 Returns: | 47 Returns: |
47 [type] -- [description] | 48 [type] -- [description] |
48 """ | 49 """ |
49 # First record to count the sequences | 50 # First record to count the sequences |
50 sequences_record_to_count = gen_sequence_record( | 51 sequences_record_to_count = gen_sequence_record( |
51 sequences_path, sequences_format) | 52 sequences_path, sequences_format) |
52 # Get the number of sequences | 53 # Get the number of sequences |
53 nb_sequences = get_nb_sequences(sequences_record_to_count) | 54 nb_sequences = get_nb_sequences(sequences_record_to_count) |
54 logger.info("%s = %i", "Number of sequences per chunk", nb_sequences) | 55 logger.info("%s = %i", "Number of sequences total", nb_sequences) |
55 # Second record that will be split | 56 # Second record that will be split |
56 sequences_to_split = gen_sequence_record(sequences_path, sequences_format) | 57 sequences_to_split = gen_sequence_record(sequences_path, sequences_format) |
57 | 58 |
58 # Get the size of the chunks | 59 # Get the size of the chunks |
59 chunk_size = int(nb_sequences / nb_chunk) if nb_sequences > nb_chunk else 1 | 60 chunk_size = int(nb_sequences / nb_chunk) if nb_sequences > nb_chunk else 1 |
60 return gen_get_chunks_by_size(sequences_to_split, chunk_size) | 61 return gen_chunks_of_size( |
62 sequences_to_split, | |
63 chunk_size, | |
64 nb_chunk, | |
65 nb_sequences | |
66 ) | |
61 | 67 |
62 | 68 |
63 def gen_get_chunks_by_size(iterable, size=10): | 69 def gen_chunks_of_size( |
64 logger.info( | 70 iterable, size=10, |
65 "%s = %i", | 71 limit_nb_chunk=None, |
66 "chunk size got (could be different from parameter if more chunk asked \ | 72 nb_sequences=None |
67 than sequences in multifasta)", | 73 ): |
68 size | 74 """[summary] |
69 ) | 75 Split sequences by size |
76 Arguments: | |
77 iterable {[type]} -- [description] | |
78 | |
79 Keyword Arguments: | |
80 size {int} -- [description] (default: {10}) | |
81 """ | |
70 iterator = iter(iterable) | 82 iterator = iter(iterable) |
71 for first in iterator: | 83 if limit_nb_chunk is not None and nb_sequences is not None: |
72 yield chain([first], islice(iterator, size - 1)) | 84 nb_chunk_left = limit_nb_chunk |
85 nb_sequences_left = nb_sequences | |
86 for first in iterator: | |
87 if (size + 1) * nb_chunk_left > nb_sequences_left: | |
88 nb_sequences_left -= size | |
89 nb_chunk_left -= 1 | |
90 yield chain([first], islice(iterator, size - 1)) | |
91 else: | |
92 nb_chunk_left -= 1 | |
93 size += 1 | |
94 nb_sequences_left -= size | |
95 yield chain([first], islice(iterator, size - 1)) | |
96 else: | |
97 for first in iterator: | |
98 yield chain([first], islice(iterator, size - 1)) | |
73 | 99 |
74 | 100 |
75 def gen_sequence_record(sequences_path, sequence_format): | 101 def gen_sequence_record(sequences_path, sequence_format): |
76 return SeqIO.parse(sequences_path, sequence_format) | 102 return SeqIO.parse(sequences_path, sequence_format) |
77 | 103 |
78 | 104 |
79 def get_nb_sequences(sequences): | 105 def get_nb_sequences(sequences): |
106 """[summary] | |
107 Compute the number of sequences | |
108 Arguments: | |
109 sequences {[type]} -- Iterable of sequences | |
110 | |
111 Returns: | |
112 [type] -- Number of sequences | |
113 """ | |
80 return sum(1 for _ in sequences) | 114 return sum(1 for _ in sequences) |
81 | 115 |
82 | 116 |
83 def write_chunks(iterable, dirname, filename, file_extension, sequence_format): | 117 def write_chunks(iterable, dirname, filename, file_extension, sequence_format): |
118 sequence_total = 0 | |
84 for idx, chunk in enumerate(iterable): | 119 for idx, chunk in enumerate(iterable): |
85 if not os.path.exists(dirname): | 120 if not os.path.exists(dirname): |
86 os.mkdir(dirname) | 121 os.mkdir(dirname) |
87 output_file = os.path.join( | 122 output_file = os.path.join( |
88 dirname, filename + "-chunk-" + str(idx + 1) + file_extension | 123 dirname, filename + "-chunk-" + str(idx + 1) + file_extension |
89 ) | 124 ) |
125 | |
90 with open(output_file, mode="w") as output_handle: | 126 with open(output_file, mode="w") as output_handle: |
91 count_seq, seq_to_write = tee(chunk, 2) | 127 count_seq, seq_to_write = tee(chunk, 2) |
92 logger.info( | 128 nb_seq = len(list(count_seq)) |
93 "%s : number of seuquences = %i", output_file, len( | 129 logger.info("%s : number of sequences = %i", output_file, nb_seq) |
94 list(count_seq)) | 130 sequence_total += nb_seq |
95 ) | |
96 SeqIO.write(seq_to_write, output_handle, sequence_format) | 131 SeqIO.write(seq_to_write, output_handle, sequence_format) |
132 logger.info("%s = %i", "Total number of chunks", idx + 1) | |
133 logger.info("%s = %i", "Number of sequences total", sequence_total) | |
134 | |
135 | |
136 def positive_integer(str_value): | |
137 """[summary] | |
138 Define a type for argparse in order to enforce integer greater than 0 | |
139 Arguments: | |
140 str_value {[type]} -- Value got by argparse | |
141 | |
142 Raises: | |
143 argparse.ArgumentTypeError: When the value is not an integer > 0 | |
144 | |
145 Returns: | |
146 [type] -- [description] | |
147 """ | |
148 value = int(str_value) | |
149 if isinstance(value, int) and value > 0: | |
150 return value | |
151 else: | |
152 msg = "%r is not an integer > 0" % value | |
153 raise argparse.ArgumentTypeError(msg) | |
97 | 154 |
98 | 155 |
99 def parse_arguments(): | 156 def parse_arguments(): |
100 parser = argparse.ArgumentParser(description="Split fasta/fastq files") | 157 parser = argparse.ArgumentParser(description="Split fasta/fastq files") |
101 parser.add_argument( | 158 parser.add_argument( |
104 | 161 |
105 parser.add_argument("-f", "--format", type=str, | 162 parser.add_argument("-f", "--format", type=str, |
106 help="File format (fastq, fasta)") | 163 help="File format (fastq, fasta)") |
107 group = parser.add_mutually_exclusive_group(required=True) | 164 group = parser.add_mutually_exclusive_group(required=True) |
108 group.add_argument( | 165 group.add_argument( |
109 "-c", "--chunk-size", | 166 "-c", "--chunk-size", type=positive_integer, |
110 type=int, | |
111 help="The number of sequences by chunks." | 167 help="The number of sequences by chunks." |
112 ) | 168 ) |
113 group.add_argument("-n", "--nb-chunk", help="Number of chunks", type=int) | 169 group.add_argument( |
170 "-n", "--nb-chunk", | |
171 type=positive_integer, | |
172 help="Number of chunks" | |
173 ) | |
114 | 174 |
115 parser.add_argument( | 175 parser.add_argument( |
116 "-o", | 176 "-o", |
117 "--output", | 177 "--output", |
118 type=str, | 178 type=str, |