Mercurial > repos > rplanel > sequence_splitter
changeset 1:7b509a1801e4 draft
"planemo upload for repository https://github.com/rplanel/galaxy-tools/tree/master/tools/sequence-splitter commit 85ffc2e3805940c0ebb21d5450b86524bd788d7b"
author | rplanel |
---|---|
date | Tue, 20 Aug 2019 09:46:49 -0400 |
parents | 3e33310a7082 |
children | 6dd4f53b9964 |
files | macros.xml sequence-splitter.py sequence-splitter.xml test-data/sample-4-chunk-1.fastq test-data/sample-4-chunk-2.fastq test-data/sample-4-chunk-3.fastq test-data/sample-4-chunk-4.fastq test-data/sample-5-chunk-1.fasta test-data/sample-chunk-1.fasta |
diffstat | 9 files changed, 188 insertions(+), 55 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Tue Aug 20 09:46:49 2019 -0400 @@ -0,0 +1,12 @@ +<?xml version='1.0' encoding='UTF-8'?> +<macros> + <xml name="citations"> + <citations> + <citation type="bibtex"> + @UNPUBLISHED{None, + author = {Rémi Planel}, + title = {None}, + }</citation> + </citations> + </xml> +</macros> \ No newline at end of file
--- a/sequence-splitter.py Thu Aug 08 11:18:30 2019 -0400 +++ b/sequence-splitter.py Tue Aug 20 09:46:49 2019 -0400 @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python2 # coding: utf-8 import argparse @@ -24,24 +24,25 @@ chunk_size = args.chunk_size # split the sequences in chunks if chunk_size: - logger.info("%s = %s", "chunk size parameter", chunk_size) + logger.info( + "%s = %s", "Number of sequences per chunk parameter", chunk_size) sequences_record = gen_sequence_record(args.sequences, args.format) - chunks = gen_get_chunks_by_size(sequences_record, chunk_size) + chunks = gen_chunks_of_size(sequences_record, chunk_size) else: logger.info("%s = %s", "number of chunks parameter", args.nb_chunk) - chunks = gen_get_chunks(args.sequences, args.format, args.nb_chunk) + chunks = gen_chunks(args.sequences, args.format, args.nb_chunk) # Write the chunks in numbered files. write_chunks(chunks, args.output, filename, file_extension, args.format) -def gen_get_chunks(sequences_path, sequences_format, nb_chunk): +def gen_chunks(sequences_path, sequences_format, nb_chunk): """[summary] - + Split sequences to have a number of chunks defined by nb_chunks Arguments: - sequences_path {[type]} -- [description] - sequences_format {[type]} -- [description] - nb_chunk {[type]} -- [description] + sequences_path {[type]} -- Path to the sequence file + sequences_format {[type]} -- Format the sequence + nb_chunk {int} -- Number of chunks we want to obtain Returns: [type] -- [description] @@ -51,25 +52,50 @@ sequences_path, sequences_format) # Get the number of sequences nb_sequences = get_nb_sequences(sequences_record_to_count) - logger.info("%s = %i", "Number of sequences per chunk", nb_sequences) + logger.info("%s = %i", "Number of sequences total", nb_sequences) # Second record to that will be splitted sequences_to_split = gen_sequence_record(sequences_path, sequences_format) # Get the size of the chunks chunk_size = int(nb_sequences / nb_chunk) if nb_sequences > nb_chunk else 1 - return gen_get_chunks_by_size(sequences_to_split, chunk_size) + return gen_chunks_of_size( + sequences_to_split, + chunk_size, + nb_chunk, + nb_sequences + ) -def gen_get_chunks_by_size(iterable, size=10): - logger.info( - "%s = %i", - "chunk size got (could be different from parameter if more chunk asked \ - than sequences in multifasta)", - size - ) +def gen_chunks_of_size( + iterable, size=10, + limit_nb_chunk=None, + nb_sequences=None +): + """[summary] + Split sequences by size + Arguments: + iterable {[type]} -- [description] + + Keyword Arguments: + size {int} -- [description] (default: {10}) + """ iterator = iter(iterable) - for first in iterator: - yield chain([first], islice(iterator, size - 1)) + if limit_nb_chunk is not None and nb_sequences is not None: + nb_chunk_left = limit_nb_chunk + nb_sequences_left = nb_sequences + for first in iterator: + if (size + 1) * nb_chunk_left > nb_sequences_left: + nb_sequences_left -= size + nb_chunk_left -= 1 + yield chain([first], islice(iterator, size - 1)) + else: + nb_chunk_left -= 1 + size += 1 + nb_sequences_left -= size + yield chain([first], islice(iterator, size - 1)) + else: + for first in iterator: + yield chain([first], islice(iterator, size - 1)) def gen_sequence_record(sequences_path, sequence_format): @@ -77,23 +103,54 @@ def get_nb_sequences(sequences): + """[summary] + Compute the number of sequences + Arguments: + sequences {[type]} -- Iterable of sequences + + Returns: + [type] -- Number of sequences + """ return sum(1 for _ in sequences) def write_chunks(iterable, dirname, filename, file_extension, sequence_format): + sequence_total = 0 for idx, chunk in enumerate(iterable): if not os.path.exists(dirname): os.mkdir(dirname) output_file = os.path.join( dirname, filename + "-chunk-" + str(idx + 1) + file_extension ) + with open(output_file, mode="w") as output_handle: count_seq, seq_to_write = tee(chunk, 2) - logger.info( - "%s : number of seuquences = %i", output_file, len( - list(count_seq)) - ) + nb_seq = len(list(count_seq)) + logger.info("%s : number of sequences = %i", output_file, nb_seq) + sequence_total += nb_seq SeqIO.write(seq_to_write, output_handle, sequence_format) + logger.info("%s = %i", "Total number of chunks", idx + 1) + logger.info("%s = %i", "Number of sequences total", sequence_total) + + +def positive_integer(str_value): + """[summary] + Define a type for argparse in order to enforce integer greater than 0 + Arguments: + str_value {[type]} -- Value got by argparse + + Raises: + argparse.ArgumentTypeError: When the value is not an integer > 0 + + Returns: + [type] -- [description] + """ + value = int(str_value) + if isinstance(value, int) and value > 0: + return value + else: + msg = "%r is not an integer > 0" % value + raise argparse.ArgumentTypeError(msg) def parse_arguments(): @@ -106,11 +163,14 @@ help="File format (fastq, fasta)") group = parser.add_mutually_exclusive_group(required=True) group.add_argument( - "-c", "--chunk-size", - type=int, + "-c", "--chunk-size", type=positive_integer, help="The number of sequences by chunks." ) - group.add_argument("-n", "--nb-chunk", help="Number of chunks", type=int) + group.add_argument( + "-n", "--nb-chunk", + type=positive_integer, + help="Number of chunks" + ) parser.add_argument( "-o",
--- a/sequence-splitter.xml Thu Aug 08 11:18:30 2019 -0400 +++ b/sequence-splitter.xml Tue Aug 20 09:46:49 2019 -0400 @@ -1,4 +1,7 @@ -<tool id="sequence-splitter" name="Sequence Splitter" version="0.1.0"> +<tool id="sequence-splitter" name="Sequence Splitter" version="0.1.1"> + <macros> + <import>macros.xml</import> + </macros> <requirements> <requirement type="package" version="2.7.11">python</requirement> <requirement type="package" version="1.73">biopython</requirement> @@ -50,17 +53,7 @@ <data format="text" name="logfile" from_work_dir="sequence-splitter.log" label="${tool.name} logfile on ${on_string}"/> </outputs> <tests> - <test> - <param name="sequences" value="sample.fasta"/> - <conditional name="split_mode"> - <param name="selector" value="chunk_size"/> - <param name="chunk_size" value="2"/> - </conditional> - <output_collection name="split_output" type="list" count="2"> - <element name="sequences-chunk-1" file="sample-chunk-1.fasta" ftype="fasta"/> - <element name="sequences-chunk-2" file="sample-chunk-2.fasta" ftype="fasta"/> - </output_collection> - </test> + <!-- CHUNK SIZE --> <test> <param name="sequences" value="sample.fastq"/> <conditional name="split_mode"> @@ -68,14 +61,24 @@ <param name="chunk_size" value="2"/> </conditional> <output_collection name="split_output" type="list" count="5"> - <element name="sequences-chunk-1" file="sample-chunk-1.fastq" ftype="fastqsanger"/> - <element name="sequences-chunk-2" file="sample-chunk-2.fastq" ftype="fastqsanger"/> - <element name="sequences-chunk-3" file="sample-chunk-3.fastq" ftype="fastqsanger"/> - <element name="sequences-chunk-4" file="sample-chunk-4.fastq" ftype="fastqsanger"/> - <element name="sequences-chunk-5" file="sample-chunk-5.fastq" ftype="fastqsanger"/> + <element name="sequences-chunk-1" file="sample-chunk-1.fastq" ftype="fastqsanger" compare="diff"/> + <element name="sequences-chunk-2" file="sample-chunk-2.fastq" ftype="fastqsanger" compare="diff"/> + <element name="sequences-chunk-3" file="sample-chunk-3.fastq" ftype="fastqsanger" compare="diff"/> + <element name="sequences-chunk-4" file="sample-chunk-4.fastq" ftype="fastqsanger" compare="diff"/> + <element name="sequences-chunk-5" file="sample-chunk-5.fastq" ftype="fastqsanger" compare="diff"/> </output_collection> </test> - + <test> + <param name="sequences" value="sample.fasta"/> + <conditional name="split_mode"> + <param name="selector" value="chunk_size" /> + <param name="chunk_size" value="5"/> + </conditional> + <output_collection name="split_output" type="list" count="1"> + <element name="sequences-chunk-1" file="sample-5-chunk-1.fasta" ftype="fasta" compare="diff"/> + </output_collection> + </test> + <!-- NUMBER CHUNK --> <test> <param name="sequences" value="sample.fastq"/> <conditional name="split_mode"> @@ -83,8 +86,21 @@ <param name="nb_chunk" value="2"/> </conditional> <output_collection name="split_output" type="list" count="2"> - <element name="sequences-chunk-1" file="sample-3-chunk-1.fastq" ftype="fastqsanger"/> - <element name="sequences-chunk-2" file="sample-3-chunk-2.fastq" ftype="fastqsanger"/> + <element name="sequences-chunk-1" file="sample-3-chunk-1.fastq" ftype="fastqsanger" compare="diff"/> + <element name="sequences-chunk-2" file="sample-3-chunk-2.fastq" ftype="fastqsanger" compare="diff"/> + </output_collection> + </test> + <test> + <param name="sequences" value="sample.fastq"/> + <conditional name="split_mode"> + <param name="selector" value="nb_chunk" /> + <param name="nb_chunk" value="4"/> + </conditional> + <output_collection name="split_output" type="list" count="4"> + <element name="sequences-chunk-1" file="sample-4-chunk-1.fastq" ftype="fastqsanger" compare="diff"/> + <element name="sequences-chunk-2" file="sample-4-chunk-2.fastq" ftype="fastqsanger" compare="diff"/> + <element name="sequences-chunk-3" file="sample-4-chunk-3.fastq" ftype="fastqsanger" compare="diff"/> + <element name="sequences-chunk-4" file="sample-4-chunk-4.fastq" ftype="fastqsanger" compare="diff"/> </output_collection> </test> <test> @@ -94,10 +110,10 @@ <param name="nb_chunk" value="10"/> </conditional> <output_collection name="split_output" type="list" count="4"> - <element name="sequences-chunk-1" file="sample-2-chunk-1.fasta" ftype="fasta"/> - <element name="sequences-chunk-2" file="sample-2-chunk-2.fasta" ftype="fasta"/> - <element name="sequences-chunk-3" file="sample-2-chunk-3.fasta" ftype="fasta"/> - <element name="sequences-chunk-4" file="sample-2-chunk-4.fasta" ftype="fasta"/> + <element name="sequences-chunk-1" file="sample-2-chunk-1.fasta" ftype="fasta" compare="diff"/> + <element name="sequences-chunk-2" file="sample-2-chunk-2.fasta" ftype="fasta" compare="diff"/> + <element name="sequences-chunk-3" file="sample-2-chunk-3.fasta" ftype="fasta" compare="diff"/> + <element name="sequences-chunk-4" file="sample-2-chunk-4.fasta" ftype="fasta" compare="diff"/> </output_collection> </test> @@ -129,4 +145,5 @@ ]]> </help> + <expand macro="citations" /> </tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-4-chunk-1.fastq Tue Aug 20 09:46:49 2019 -0400 @@ -0,0 +1,8 @@ +@cluster_2:UMI_ATTCCG +TTTCCGGGGCACATAATCTTCAGCCGGGCGC ++ +9C;=;=<9@4868>9:67AA<9>65<=>591 +@cluster_8:UMI_CTTTGA +TATCCTTGCAATACTCTCCGAACGGGAGAGC ++ +1/04.72,(003,-2-22+00-12./.-.4-
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-4-chunk-2.fastq Tue Aug 20 09:46:49 2019 -0400 @@ -0,0 +1,8 @@ +@cluster_12:UMI_GGTCAA +GCAGTTTAAGATCATTTTATTGAAGAGCAAG ++ +?7?AEEC@>=1?A?EEEB9ECB?==:B.A?A +@cluster_21:UMI_AGAACA +GGCATTGCAAAATTTATTACACCCCCAGATC ++ +>=2.660/?:36AD;0<14703640334-//
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-4-chunk-3.fastq Tue Aug 20 09:46:49 2019 -0400 @@ -0,0 +1,12 @@ +@cluster_29:UMI_GCAGGA +CCCCCTTAAATAGCTGTTTATTTGGCCCCAG ++ +8;;;>DC@DAC=B?C@9?B?CDCB@><<??A +@cluster_34:UMI_AGCTCA +TCTTGCAAAAACTCCTAGATCGGAAGAGCAC ++ +-/CA:+<599803./2065?6=<>90;?150 +@cluster_36:UMI_AACAGA +TCCCCCCCCCAAATCGGAAAAACACACCCCC ++ +5?:5;<02:@977=:<0=9>@5>7>;>*3,-
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-4-chunk-4.fastq Tue Aug 20 09:46:49 2019 -0400 @@ -0,0 +1,12 @@ +@cluster_37:UMI_GAGGAG +GTCTTTGTACAAAATTTTATTAAAGGTCTTT ++ +?B?DEC@A=?ADDAEEEC?EC@D6A@@>DE4 +@cluster_39:UMI_GAACCG +CCTTCCATCACCAGATCGGAAAAACACACGC ++ +00>7;8@5<192?/8;0;;>=3=/3239713 +@cluster_43:UMI_GGATTG +GAGTTATAATCCAATCTTTATTTAAAAATCT ++ +>=AEC?C@;??0A>?0DEB9EEB@DDC1?=6