# HG changeset patch
# User rplanel
# Date 1566308809 14400
# Node ID 7b509a1801e4e61024ea01481d1e49e0e167da1a
# Parent 3e33310a7082c4f54212560379375cc27630b071
"planemo upload for repository https://github.com/rplanel/galaxy-tools/tree/master/tools/sequence-splitter commit 85ffc2e3805940c0ebb21d5450b86524bd788d7b"
diff -r 3e33310a7082 -r 7b509a1801e4 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Tue Aug 20 09:46:49 2019 -0400
@@ -0,0 +1,12 @@
+
+
+
+
+
+ @UNPUBLISHED{None,
+ author = {Rémi Planel},
+ title = {None},
+ }
+
+
+
\ No newline at end of file
diff -r 3e33310a7082 -r 7b509a1801e4 sequence-splitter.py
--- a/sequence-splitter.py Thu Aug 08 11:18:30 2019 -0400
+++ b/sequence-splitter.py Tue Aug 20 09:46:49 2019 -0400
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python2
# coding: utf-8
import argparse
@@ -24,24 +24,25 @@
chunk_size = args.chunk_size
# split the sequences in chunks
if chunk_size:
- logger.info("%s = %s", "chunk size parameter", chunk_size)
+ logger.info(
+ "%s = %s", "Number of sequences per chunk parameter", chunk_size)
sequences_record = gen_sequence_record(args.sequences, args.format)
- chunks = gen_get_chunks_by_size(sequences_record, chunk_size)
+ chunks = gen_chunks_of_size(sequences_record, chunk_size)
else:
logger.info("%s = %s", "number of chunks parameter", args.nb_chunk)
- chunks = gen_get_chunks(args.sequences, args.format, args.nb_chunk)
+ chunks = gen_chunks(args.sequences, args.format, args.nb_chunk)
# Write the chunks in numbered files.
write_chunks(chunks, args.output, filename, file_extension, args.format)
-def gen_get_chunks(sequences_path, sequences_format, nb_chunk):
+def gen_chunks(sequences_path, sequences_format, nb_chunk):
"""[summary]
-
+ Split the sequences into the number of chunks given by nb_chunk
Arguments:
- sequences_path {[type]} -- [description]
- sequences_format {[type]} -- [description]
- nb_chunk {[type]} -- [description]
+ sequences_path {[type]} -- Path to the sequence file
+ sequences_format {[type]} -- Format of the sequence file
+ nb_chunk {int} -- Number of chunks we want to obtain
Returns:
[type] -- [description]
@@ -51,25 +52,50 @@
sequences_path, sequences_format)
# Get the number of sequences
nb_sequences = get_nb_sequences(sequences_record_to_count)
- logger.info("%s = %i", "Number of sequences per chunk", nb_sequences)
+ logger.info("%s = %i", "Number of sequences total", nb_sequences)
# Second record to that will be splitted
sequences_to_split = gen_sequence_record(sequences_path, sequences_format)
# Get the size of the chunks
chunk_size = int(nb_sequences / nb_chunk) if nb_sequences > nb_chunk else 1
- return gen_get_chunks_by_size(sequences_to_split, chunk_size)
+ return gen_chunks_of_size(
+ sequences_to_split,
+ chunk_size,
+ nb_chunk,
+ nb_sequences
+ )
-def gen_get_chunks_by_size(iterable, size=10):
- logger.info(
- "%s = %i",
- "chunk size got (could be different from parameter if more chunk asked \
- than sequences in multifasta)",
- size
- )
+def gen_chunks_of_size(
+ iterable, size=10,
+ limit_nb_chunk=None,
+ nb_sequences=None
+):
+ """[summary]
+ Split sequences by size
+ Arguments:
+ iterable {[type]} -- Iterable of sequence records to split
+
+ Keyword Arguments:
+ size {int} -- Number of sequences per chunk (default: {10})
+ """
iterator = iter(iterable)
- for first in iterator:
- yield chain([first], islice(iterator, size - 1))
+ if limit_nb_chunk is not None and nb_sequences is not None:
+ nb_chunk_left = limit_nb_chunk
+ nb_sequences_left = nb_sequences
+ for first in iterator:
+ if (size + 1) * nb_chunk_left > nb_sequences_left:
+ nb_sequences_left -= size
+ nb_chunk_left -= 1
+ yield chain([first], islice(iterator, size - 1))
+ else:
+ nb_chunk_left -= 1
+ size += 1
+ nb_sequences_left -= size
+ yield chain([first], islice(iterator, size - 1))
+ else:
+ for first in iterator:
+ yield chain([first], islice(iterator, size - 1))
def gen_sequence_record(sequences_path, sequence_format):
@@ -77,23 +103,54 @@
def get_nb_sequences(sequences):
+ """[summary]
+ Compute the number of sequences
+ Arguments:
+ sequences {[type]} -- Iterable of sequences
+
+ Returns:
+ int -- Number of sequences
+ """
return sum(1 for _ in sequences)
def write_chunks(iterable, dirname, filename, file_extension, sequence_format):
+ sequence_total = 0
for idx, chunk in enumerate(iterable):
if not os.path.exists(dirname):
os.mkdir(dirname)
output_file = os.path.join(
dirname, filename + "-chunk-" + str(idx + 1) + file_extension
)
+
with open(output_file, mode="w") as output_handle:
count_seq, seq_to_write = tee(chunk, 2)
- logger.info(
- "%s : number of seuquences = %i", output_file, len(
- list(count_seq))
- )
+ nb_seq = len(list(count_seq))
+ logger.info("%s : number of sequences = %i", output_file, nb_seq)
+ sequence_total += nb_seq
SeqIO.write(seq_to_write, output_handle, sequence_format)
+ logger.info("%s = %i", "Total number of chunks", idx + 1)
+ logger.info("%s = %i", "Number of sequences total", sequence_total)
+
+
+def positive_integer(str_value):
+ """[summary]
+ Define a type for argparse in order to enforce integer greater than 0
+ Arguments:
+ str_value {[type]} -- Raw value received from argparse
+
+ Raises:
+ argparse.ArgumentTypeError: When the value is not an integer > 0
+
+ Returns:
+ int -- The validated integer, guaranteed to be > 0
+ """
+ value = int(str_value)
+ if isinstance(value, int) and value > 0:
+ return value
+ else:
+ msg = "%r is not an integer > 0" % value
+ raise argparse.ArgumentTypeError(msg)
def parse_arguments():
@@ -106,11 +163,14 @@
help="File format (fastq, fasta)")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
- "-c", "--chunk-size",
- type=int,
+ "-c", "--chunk-size", type=positive_integer,
help="The number of sequences by chunks."
)
- group.add_argument("-n", "--nb-chunk", help="Number of chunks", type=int)
+ group.add_argument(
+ "-n", "--nb-chunk",
+ type=positive_integer,
+ help="Number of chunks"
+ )
parser.add_argument(
"-o",
diff -r 3e33310a7082 -r 7b509a1801e4 sequence-splitter.xml
--- a/sequence-splitter.xml Thu Aug 08 11:18:30 2019 -0400
+++ b/sequence-splitter.xml Tue Aug 20 09:46:49 2019 -0400
@@ -1,4 +1,7 @@
-
+
+
+ macros.xml
+
python
biopython
@@ -50,17 +53,7 @@
-
-
-
-
-
-
-
-
-
-
-
+
@@ -68,14 +61,24 @@
-
-
-
-
-
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
@@ -83,8 +86,21 @@
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -94,10 +110,10 @@
-
-
-
-
+
+
+
+
@@ -129,4 +145,5 @@
]]>
+
\ No newline at end of file
diff -r 3e33310a7082 -r 7b509a1801e4 test-data/sample-4-chunk-1.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample-4-chunk-1.fastq Tue Aug 20 09:46:49 2019 -0400
@@ -0,0 +1,8 @@
+@cluster_2:UMI_ATTCCG
+TTTCCGGGGCACATAATCTTCAGCCGGGCGC
++
+9C;=;=<9@4868>9:67AA<9>65<=>591
+@cluster_8:UMI_CTTTGA
+TATCCTTGCAATACTCTCCGAACGGGAGAGC
++
+1/04.72,(003,-2-22+00-12./.-.4-
diff -r 3e33310a7082 -r 7b509a1801e4 test-data/sample-4-chunk-2.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample-4-chunk-2.fastq Tue Aug 20 09:46:49 2019 -0400
@@ -0,0 +1,8 @@
+@cluster_12:UMI_GGTCAA
+GCAGTTTAAGATCATTTTATTGAAGAGCAAG
++
+?7?AEEC@>=1?A?EEEB9ECB?==:B.A?A
+@cluster_21:UMI_AGAACA
+GGCATTGCAAAATTTATTACACCCCCAGATC
++
+>=2.660/?:36AD;0<14703640334-//
diff -r 3e33310a7082 -r 7b509a1801e4 test-data/sample-4-chunk-3.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample-4-chunk-3.fastq Tue Aug 20 09:46:49 2019 -0400
@@ -0,0 +1,12 @@
+@cluster_29:UMI_GCAGGA
+CCCCCTTAAATAGCTGTTTATTTGGCCCCAG
++
+8;;;>DC@DAC=B?C@9?B?CDCB@><?A
+@cluster_34:UMI_AGCTCA
+TCTTGCAAAAACTCCTAGATCGGAAGAGCAC
++
+-/CA:+<599803./2065?6=<>90;?150
+@cluster_36:UMI_AACAGA
+TCCCCCCCCCAAATCGGAAAAACACACCCCC
++
+5?:5;<02:@977=:<0=9>@5>7>;>*3,-
diff -r 3e33310a7082 -r 7b509a1801e4 test-data/sample-4-chunk-4.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample-4-chunk-4.fastq Tue Aug 20 09:46:49 2019 -0400
@@ -0,0 +1,12 @@
+@cluster_37:UMI_GAGGAG
+GTCTTTGTACAAAATTTTATTAAAGGTCTTT
++
+?B?DEC@A=?ADDAEEEC?EC@D6A@@>DE4
+@cluster_39:UMI_GAACCG
+CCTTCCATCACCAGATCGGAAAAACACACGC
++
+00>7;8@5<192?/8;0;;>=3=/3239713
+@cluster_43:UMI_GGATTG
+GAGTTATAATCCAATCTTTATTTAAAAATCT
++
+>=AEC?C@;??0A>?0DEB9EEB@DDC1?=6
diff -r 3e33310a7082 -r 7b509a1801e4 test-data/sample-5-chunk-1.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample-5-chunk-1.fasta Tue Aug 20 09:46:49 2019 -0400
@@ -0,0 +1,8 @@
+>derice
+ACTGACTAGCTAGCTAACTG
+>sanka
+GCATCGTAGCTAGCTACGAT
+>junior
+CATCGATCGTACGTACGTAG
+>yul
+ATCGATCGATCGTACGATCG
diff -r 3e33310a7082 -r 7b509a1801e4 test-data/sample-chunk-1.fasta
--- a/test-data/sample-chunk-1.fasta Thu Aug 08 11:18:30 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->derice
-ACTGACTAGCTAGCTAACTG
->sanka
-GCATCGTAGCTAGCTACGAT