comparison sequence-splitter.py @ 1:7b509a1801e4 draft

"planemo upload for repository https://github.com/rplanel/galaxy-tools/tree/master/tools/sequence-splitter commit 85ffc2e3805940c0ebb21d5450b86524bd788d7b"
author rplanel
date Tue, 20 Aug 2019 09:46:49 -0400
parents 3e33310a7082
children 6dd4f53b9964
comparison
equal deleted inserted replaced
0:3e33310a7082 1:7b509a1801e4
1 #!/usr/bin/env python3 1 #!/usr/bin/env python2
2 # coding: utf-8 2 # coding: utf-8
3 3
4 import argparse 4 import argparse
5 import logging 5 import logging
6 import os 6 import os
22 # extract the filename (with no extension) 22 # extract the filename (with no extension)
23 _, filename = os.path.split(basename) 23 _, filename = os.path.split(basename)
24 chunk_size = args.chunk_size 24 chunk_size = args.chunk_size
25 # split the sequences in chunks 25 # split the sequences in chunks
26 if chunk_size: 26 if chunk_size:
27 logger.info("%s = %s", "chunk size parameter", chunk_size) 27 logger.info(
28 "%s = %s", "Number of sequences per chunk parameter", chunk_size)
28 sequences_record = gen_sequence_record(args.sequences, args.format) 29 sequences_record = gen_sequence_record(args.sequences, args.format)
29 chunks = gen_get_chunks_by_size(sequences_record, chunk_size) 30 chunks = gen_chunks_of_size(sequences_record, chunk_size)
30 else: 31 else:
31 logger.info("%s = %s", "number of chunks parameter", args.nb_chunk) 32 logger.info("%s = %s", "number of chunks parameter", args.nb_chunk)
32 chunks = gen_get_chunks(args.sequences, args.format, args.nb_chunk) 33 chunks = gen_chunks(args.sequences, args.format, args.nb_chunk)
33 34
34 # Write the chunks in numbered files. 35 # Write the chunks in numbered files.
35 write_chunks(chunks, args.output, filename, file_extension, args.format) 36 write_chunks(chunks, args.output, filename, file_extension, args.format)
36 37
37 38
def gen_chunks(sequences_path, sequences_format, nb_chunk):
    """Split the sequences of a file into a requested number of chunks.

    The file is parsed twice: a first pass only counts the sequences, a
    second pass provides the records that are actually distributed.

    Arguments:
        sequences_path -- Path to the sequence file
        sequences_format {str} -- Format of the sequence file
        nb_chunk {int} -- Number of chunks we want to obtain

    Returns:
        generator -- The chunks produced by gen_chunks_of_size
    """
    # First pass: consume a throw-away record stream to count the sequences.
    nb_sequences = get_nb_sequences(
        gen_sequence_record(sequences_path, sequences_format)
    )
    logger.info("%s = %i", "Number of sequences total", nb_sequences)

    # Second pass: a fresh record stream that will be split.
    sequences_to_split = gen_sequence_record(sequences_path, sequences_format)

    # Base chunk size. Floor division keeps the computation in exact integer
    # arithmetic (no float round-trip) and behaves the same on Python 2 and 3.
    # When more chunks are requested than sequences exist, every chunk
    # holds a single sequence.
    chunk_size = nb_sequences // nb_chunk if nb_sequences > nb_chunk else 1

    return gen_chunks_of_size(
        sequences_to_split,
        chunk_size,
        nb_chunk,
        nb_sequences
    )
61 67
62 68
def gen_chunks_of_size(
    iterable, size=10,
    limit_nb_chunk=None,
    nb_sequences=None
):
    """Yield successive chunks of the iterable.

    Without limit_nb_chunk / nb_sequences every chunk holds `size` items
    (the last one may be shorter). When both are given, the chunk size is
    raised by one as soon as the remaining items exactly fill the remaining
    chunks at `size + 1`, so the remainder of the division is spread over
    the last chunks.

    Every chunk is a lazy view on the same underlying iterator: a chunk has
    to be fully consumed before the next one is requested.

    Arguments:
        iterable -- Items to split

    Keyword Arguments:
        size {int} -- Base number of items per chunk (default: {10})
        limit_nb_chunk {int} -- Number of chunks wanted (default: {None})
        nb_sequences {int} -- Total number of items (default: {None})
    """
    records = iter(iterable)
    balanced = limit_nb_chunk is not None and nb_sequences is not None
    chunks_left = limit_nb_chunk
    sequences_left = nb_sequences

    for head in records:
        if balanced:
            # Grow the chunk size once the leftover items can fill all the
            # leftover chunks with one extra item each.
            if (size + 1) * chunks_left <= sequences_left:
                size += 1
            chunks_left -= 1
            sequences_left -= size
        yield chain([head], islice(records, size - 1))
73 99
74 100
def gen_sequence_record(sequences_path, sequence_format):
    """Parse a sequence file with SeqIO.parse.

    Arguments:
        sequences_path -- Path to the sequence file
        sequence_format -- Format name handed to SeqIO.parse

    Returns:
        The object returned by SeqIO.parse (presumably an iterator of
        records -- confirm against the SeqIO implementation in use)
    """
    records = SeqIO.parse(sequences_path, sequence_format)
    return records
77 103
78 104
def get_nb_sequences(sequences):
    """Count the items of an iterable by walking through it once.

    The iterable is consumed: a one-shot iterator is exhausted afterwards.

    Arguments:
        sequences -- Iterable of sequences

    Returns:
        int -- Number of sequences
    """
    total = 0
    for _record in sequences:
        total += 1
    return total
81 115
82 116
def write_chunks(iterable, dirname, filename, file_extension, sequence_format):
    """Write every chunk to its own numbered file and log the totals.

    Files are named "<filename>-chunk-<n><file_extension>" (n starts at 1)
    inside `dirname`, which is created on demand when the first chunk is
    written.

    Arguments:
        iterable -- Iterable of chunks, each chunk being an iterable of
            sequence records
        dirname -- Output directory
        filename -- Base name of the output files
        file_extension -- Extension appended to every output file name
        sequence_format -- Format name handed to SeqIO.write
    """
    # Initialised up front so the summary below also works when no chunk is
    # produced (e.g. empty input file) instead of hitting an unbound name.
    nb_chunks = 0
    sequence_total = 0
    for nb_chunks, chunk in enumerate(iterable, 1):
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        output_file = os.path.join(
            dirname, filename + "-chunk-" + str(nb_chunks) + file_extension
        )

        with open(output_file, mode="w") as output_handle:
            # SeqIO.write returns the number of records written, so the
            # chunk does not have to be duplicated and materialised just
            # to be counted.
            nb_seq = SeqIO.write(chunk, output_handle, sequence_format)
        logger.info("%s : number of sequences = %i", output_file, nb_seq)
        sequence_total += nb_seq
    logger.info("%s = %i", "Total number of chunks", nb_chunks)
    logger.info("%s = %i", "Number of sequences total", sequence_total)
134
135
def positive_integer(str_value):
    """Convert an argparse value to an integer greater than 0.

    Meant to be used as the `type` of an argparse argument.

    Arguments:
        str_value -- Value got by argparse

    Raises:
        argparse.ArgumentTypeError: When the value is not an integer > 0

    Returns:
        int -- The converted value
    """
    try:
        value = int(str_value)
    except (TypeError, ValueError):
        # Report unparsable input the same way as out-of-range input,
        # as promised by the documented contract.
        msg = "%r is not an integer > 0" % (str_value,)
        raise argparse.ArgumentTypeError(msg)
    if value <= 0:
        msg = "%r is not an integer > 0" % value
        raise argparse.ArgumentTypeError(msg)
    return value
97 154
98 155
99 def parse_arguments(): 156 def parse_arguments():
100 parser = argparse.ArgumentParser(description="Split fasta/fastq files") 157 parser = argparse.ArgumentParser(description="Split fasta/fastq files")
101 parser.add_argument( 158 parser.add_argument(
104 161
105 parser.add_argument("-f", "--format", type=str, 162 parser.add_argument("-f", "--format", type=str,
106 help="File format (fastq, fasta)") 163 help="File format (fastq, fasta)")
107 group = parser.add_mutually_exclusive_group(required=True) 164 group = parser.add_mutually_exclusive_group(required=True)
108 group.add_argument( 165 group.add_argument(
109 "-c", "--chunk-size", 166 "-c", "--chunk-size", type=positive_integer,
110 type=int,
111 help="The number of sequences by chunks." 167 help="The number of sequences by chunks."
112 ) 168 )
113 group.add_argument("-n", "--nb-chunk", help="Number of chunks", type=int) 169 group.add_argument(
170 "-n", "--nb-chunk",
171 type=positive_integer,
172 help="Number of chunks"
173 )
114 174
115 parser.add_argument( 175 parser.add_argument(
116 "-o", 176 "-o",
117 "--output", 177 "--output",
118 type=str, 178 type=str,