Mercurial > repos > rplanel > sequence_splitter
changeset 0:3e33310a7082 draft
planemo upload for repository https://github.com/rplanel/galaxy-tools/tree/master/tools/sequence-splitter commit cba12d215a89e9685c7d1f55d067770d7ec0dea2
author | rplanel |
---|---|
date | Thu, 08 Aug 2019 11:18:30 -0400 |
parents | |
children | 7b509a1801e4 |
files | sequence-splitter.py sequence-splitter.xml test-data/sample-2-chunk-1.fasta test-data/sample-2-chunk-2.fasta test-data/sample-2-chunk-3.fasta test-data/sample-2-chunk-4.fasta test-data/sample-3-chunk-1.fastq test-data/sample-3-chunk-2.fastq test-data/sample-chunk-1.fasta test-data/sample-chunk-1.fastq test-data/sample-chunk-2.fasta test-data/sample-chunk-2.fastq test-data/sample-chunk-3.fastq test-data/sample-chunk-4.fastq test-data/sample-chunk-5.fastq test-data/sample.fasta test-data/sample.fastq |
diffstat | 17 files changed, 405 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# File: sequence-splitter.py (new file in this changeset, Thu Aug 08 11:18:30 2019 -0400)
#!/usr/bin/env python3
# coding: utf-8
"""Split a FASTA/FASTQ file into numbered chunk files.

The input is cut either into chunks of a fixed number of sequences
(``-c``) or into at most a given number of chunks (``-n``).  Each chunk is
written to ``<output>/<input name>-chunk-<i><input extension>``.

The code is kept compatible with both Python 2.7 and Python 3.
"""

import argparse
import logging
import os
from itertools import chain, islice, tee  # noqa: F401 (tee kept: existing import)

# BioPython (Bio.SeqIO) is imported inside the functions that actually read or
# write sequences, so that ``--help`` and the pure chunking helpers keep
# working when Biopython is not installed.

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s : %(levelname)s : %(message)s"
)
logger = logging.getLogger()


def main(argv=None):
    """Parse the command line, split the sequences and write the chunks.

    Arguments:
        argv {list} -- command-line arguments without the program name;
            ``None`` (default) makes argparse read ``sys.argv[1:]``.
    """
    args = parse_arguments(argv)
    # "some/dir/reads.fastq" -> filename "reads", file_extension ".fastq"
    root, file_extension = os.path.splitext(args.sequences)
    filename = os.path.basename(root)

    # Compare against None: the two options are mutually exclusive, so
    # exactly one of them is set.
    if args.chunk_size is not None:
        logger.info("%s = %s", "chunk size parameter", args.chunk_size)
        sequences_record = gen_sequence_record(args.sequences, args.format)
        chunks = gen_get_chunks_by_size(sequences_record, args.chunk_size)
    else:
        logger.info("%s = %s", "number of chunks parameter", args.nb_chunk)
        chunks = gen_get_chunks(args.sequences, args.format, args.nb_chunk)

    # Write the chunks in numbered files.
    write_chunks(chunks, args.output, filename, file_extension, args.format)


def compute_chunk_size(nb_sequences, nb_chunk):
    """Return the chunk size needed to get at most ``nb_chunk`` chunks.

    The size is the ceiling of ``nb_sequences / nb_chunk`` so the number of
    chunks never exceeds ``nb_chunk`` (flooring would spill the remainder
    into extra chunks).  It is never smaller than 1, so asking for more
    chunks than there are sequences yields one sequence per chunk.

    Arguments:
        nb_sequences {int} -- total number of sequences to split
        nb_chunk {int} -- maximum number of chunks wanted (>= 1)

    Returns:
        int -- number of sequences per chunk (>= 1)

    Raises:
        ValueError -- if ``nb_chunk`` is smaller than 1
    """
    if nb_chunk < 1:
        raise ValueError("nb_chunk must be >= 1, got %r" % (nb_chunk,))
    # -(-a // b) is integer ceiling division, identical on Python 2 and 3.
    return max(1, -(-nb_sequences // nb_chunk))


def gen_get_chunks(sequences_path, sequences_format, nb_chunk):
    """Split a sequence file into at most ``nb_chunk`` chunks.

    The file is parsed twice: once to count the sequences, once to produce
    the chunks, so the sequences are never all held in memory.

    Arguments:
        sequences_path {str} -- path of the file that contains the sequences
        sequences_format {str} -- format name passed to ``SeqIO.parse``
        nb_chunk {int} -- maximum number of chunks wanted (>= 1)

    Returns:
        generator -- lazy chunks, see ``gen_get_chunks_by_size``
    """
    # First pass: only count the sequences.
    nb_sequences = get_nb_sequences(
        gen_sequence_record(sequences_path, sequences_format)
    )
    logger.info("%s = %i", "Number of sequences", nb_sequences)
    chunk_size = compute_chunk_size(nb_sequences, nb_chunk)
    # Second pass: a fresh parser that will actually be split.
    sequences_to_split = gen_sequence_record(sequences_path, sequences_format)
    return gen_get_chunks_by_size(sequences_to_split, chunk_size)


def gen_get_chunks_by_size(iterable, size=10):
    """Yield successive chunks of at most ``size`` items from ``iterable``.

    Every chunk is a lazy iterator reading from the shared underlying
    iterator: a chunk must be fully consumed before the next one is
    requested, otherwise the chunk boundaries shift.

    Arguments:
        iterable {iterable} -- items to split

    Keyword Arguments:
        size {int} -- maximum number of items per chunk (default: {10})

    Raises:
        ValueError -- if ``size`` is smaller than 1 (raised when the
            generator is first advanced)
    """
    if size < 1:
        raise ValueError("size must be >= 1, got %r" % (size,))
    logger.info(
        "%s = %i",
        "chunk size got (could be different from parameter if more chunk "
        "asked than sequences in multifasta)",
        size,
    )
    iterator = iter(iterable)
    # Pulling ``first`` here is what detects exhaustion, so no empty
    # trailing chunk is ever produced.
    for first in iterator:
        yield chain([first], islice(iterator, size - 1))


def gen_sequence_record(sequences_path, sequence_format):
    """Return the ``SeqIO.parse`` record iterator for a sequence file.

    Arguments:
        sequences_path {str} -- path of the file that contains the sequences
        sequence_format {str} -- format name passed to ``SeqIO.parse``
    """
    from Bio import SeqIO

    return SeqIO.parse(sequences_path, sequence_format)


def get_nb_sequences(sequences):
    """Count the items of ``sequences``; an iterator is consumed by this."""
    return sum(1 for _ in sequences)


def _ensure_directory(dirname):
    """Create ``dirname`` (and missing parents) unless it already exists."""
    try:
        os.makedirs(dirname)
    except OSError:
        # Python 2 compatible form of ``exist_ok=True``: only swallow the
        # error when the directory is really there.
        if not os.path.isdir(dirname):
            raise


def write_chunks(iterable, dirname, filename, file_extension, sequence_format):
    """Write every chunk to its own numbered file inside ``dirname``.

    Files are named ``<filename>-chunk-<i><file_extension>`` with ``i``
    starting at 1.  ``dirname`` is created when missing, even if
    ``iterable`` turns out to be empty.

    Arguments:
        iterable {iterable} -- chunks, each an iterable of sequence records
        dirname {str} -- output directory
        filename {str} -- base name of the output files (no extension)
        file_extension {str} -- extension of the output files, dot included
        sequence_format {str} -- format name passed to ``SeqIO.write``
    """
    from Bio import SeqIO

    _ensure_directory(dirname)
    for idx, chunk in enumerate(iterable, start=1):
        output_file = os.path.join(
            dirname, "%s-chunk-%d%s" % (filename, idx, file_extension)
        )
        with open(output_file, mode="w") as output_handle:
            # SeqIO.write streams the records and returns how many it
            # wrote, so the chunk is consumed exactly once and never
            # buffered just to be counted.
            nb_written = SeqIO.write(chunk, output_handle, sequence_format)
        logger.info("%s : number of sequences = %i", output_file, nb_written)


def _positive_int(value):
    """argparse ``type=`` callable accepting only integers >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError("%r is not an integer" % (value,))
    if number < 1:
        raise argparse.ArgumentTypeError(
            "%r is not a positive integer" % (value,)
        )
    return number


def parse_arguments(argv=None):
    """Build the command-line parser and parse ``argv``.

    Arguments:
        argv {list} -- arguments without the program name; ``None``
            (default) makes argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace -- with ``sequences``, ``format``, ``chunk_size``,
            ``nb_chunk`` and ``output`` attributes
    """
    parser = argparse.ArgumentParser(description="Split fasta/fastq files")
    # Both options are needed by every code path, so a missing one is
    # reported by argparse instead of crashing later on ``None``.
    parser.add_argument(
        "-s",
        "--sequences",
        type=str,
        required=True,
        help="File that contains the sequences",
    )
    parser.add_argument(
        "-f",
        "--format",
        type=str,
        required=True,
        help="File format (fastq, fasta)",
    )

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "-c",
        "--chunk-size",
        type=_positive_int,
        help="The number of sequences by chunks.",
    )
    group.add_argument(
        "-n", "--nb-chunk", type=_positive_int, help="Number of chunks"
    )

    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="./",
        help="The output directory where the chunks will be saved",
    )

    return parser.parse_args(argv)


if __name__ == "__main__":
    logger.info("START")
    main()
    logger.info("FINISHED")
<!-- File: sequence-splitter.xml (new file in this changeset, Thu Aug 08 11:18:30 2019 -0400) -->
<!-- Galaxy wrapper for sequence-splitter.py: splits a fasta/fastq dataset into a list collection of chunks. -->
<tool id="sequence-splitter" name="Sequence Splitter" version="0.1.0">
    <requirements>
        <!-- NOTE(review): the script's shebang targets python3, but it is launched below
             through "python" in this Python 2.7 environment. Confirm the pin is intended. -->
        <requirement type="package" version="2.7.11">python</requirement>
        <requirement type="package" version="1.73">biopython</requirement>
    </requirements>
    <edam_operations>
        <edam_operation>operation_2409</edam_operation>
        <edam_operation>operation_3359</edam_operation>
    </edam_operations>
    <edam_topics>
        <edam_topic>topic_3307</edam_topic>
        <edam_topic>topic_0080</edam_topic>
    </edam_topics>
    <!-- The input is symlinked as sequences.<ext> so that every chunk is named
         sequences-chunk-<i>.<ext>, which is what discover_datasets matches below. -->
    <command detect_errors="exit_code"><![CDATA[
        ln -s '$sequences' ./sequences.${sequences.ext} &&
        python
        '$__tool_directory__/sequence-splitter.py'
        -s ./sequences.$sequences.ext
        #if $sequences.is_of_type("fasta")
            -f fasta
        #else
            -f fastq
        #end if
        #if $split_mode.selector == 'chunk_size'
            -c $split_mode.chunk_size
        #else
            -n $split_mode.nb_chunk
        #end if
        -o 'outputs'
    ]]>    </command>
    <inputs>
        <param type="data" name="sequences" format="fasta,fastq"/>
        <conditional name="split_mode">
            <param name="selector" type="select" label="Split by:">
                <option value="chunk_size">Number of sequence(s) by chunk</option>
                <option value="nb_chunk">Number of chunk(s)</option>
            </param>
            <when value="chunk_size">
                <param name="chunk_size" label="Chunk size" type="integer" min="1" value="100" help="How many sequences by chunk ?"/>
            </when>
            <when value="nb_chunk">
                <param name="nb_chunk" label="Number of chunks" type="integer" min="1" value="10" help="How many chunks ?"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <collection name="split_output" type="list" label="splitted sequences">
            <!-- "<" and ">" of the named groups are escaped: a raw "<" is not allowed in an XML attribute value. -->
            <discover_datasets pattern="(?P&lt;designation&gt;\w+-chunk-\d+)\.(?P&lt;ext&gt;.+)" directory="outputs" />
        </collection>
        <!-- NOTE(review): sequence-splitter.py configures logging without a file handler and the
             command above does not redirect anything to sequence-splitter.log, so nothing visible
             here creates that file. Confirm this output is ever populated. -->
        <data format="txt" name="logfile" from_work_dir="sequence-splitter.log" label="${tool.name} logfile on ${on_string}"/>
    </outputs>
    <tests>
        <test>
            <param name="sequences" value="sample.fasta"/>
            <conditional name="split_mode">
                <param name="selector" value="chunk_size"/>
                <param name="chunk_size" value="2"/>
            </conditional>
            <output_collection name="split_output" type="list" count="2">
                <element name="sequences-chunk-1" file="sample-chunk-1.fasta" ftype="fasta"/>
                <element name="sequences-chunk-2" file="sample-chunk-2.fasta" ftype="fasta"/>
            </output_collection>
        </test>
        <test>
            <param name="sequences" value="sample.fastq"/>
            <conditional name="split_mode">
                <param name="selector" value="chunk_size"/>
                <param name="chunk_size" value="2"/>
            </conditional>
            <output_collection name="split_output" type="list" count="5">
                <element name="sequences-chunk-1" file="sample-chunk-1.fastq" ftype="fastqsanger"/>
                <element name="sequences-chunk-2" file="sample-chunk-2.fastq" ftype="fastqsanger"/>
                <element name="sequences-chunk-3" file="sample-chunk-3.fastq" ftype="fastqsanger"/>
                <element name="sequences-chunk-4" file="sample-chunk-4.fastq" ftype="fastqsanger"/>
                <element name="sequences-chunk-5" file="sample-chunk-5.fastq" ftype="fastqsanger"/>
            </output_collection>
        </test>

        <test>
            <param name="sequences" value="sample.fastq"/>
            <conditional name="split_mode">
                <param name="selector" value="nb_chunk" />
                <param name="nb_chunk" value="2"/>
            </conditional>
            <output_collection name="split_output" type="list" count="2">
                <element name="sequences-chunk-1" file="sample-3-chunk-1.fastq" ftype="fastqsanger"/>
                <element name="sequences-chunk-2" file="sample-3-chunk-2.fastq" ftype="fastqsanger"/>
            </output_collection>
        </test>
        <test>
            <!-- More chunks requested (10) than sequences available (4): one sequence per chunk. -->
            <param name="sequences" value="sample.fasta"/>
            <conditional name="split_mode">
                <param name="selector" value="nb_chunk" />
                <param name="nb_chunk" value="10"/>
            </conditional>
            <output_collection name="split_output" type="list" count="4">
                <element name="sequences-chunk-1" file="sample-2-chunk-1.fasta" ftype="fasta"/>
                <element name="sequences-chunk-2" file="sample-2-chunk-2.fasta" ftype="fasta"/>
                <element name="sequences-chunk-3" file="sample-2-chunk-3.fasta" ftype="fasta"/>
                <element name="sequences-chunk-4" file="sample-2-chunk-4.fasta" ftype="fasta"/>
            </output_collection>
        </test>

    </tests>
    <help><![CDATA[
Usage:

    sequence-splitter.py [-h] [-s SEQUENCES] [-f FORMAT] (-c CHUNK_SIZE | -n NB_CHUNK) [-o OUTPUT]

Split fasta/fastq files

optional arguments:
-h, --help  show this help message and exit

-s SEQUENCES, --sequences SEQUENCES
    File that contains the sequences

-f FORMAT, --format FORMAT
    File format (fastq, fasta)

-c CHUNK_SIZE, --chunk-size CHUNK_SIZE
    The number of sequences by chunks

-n NB_CHUNK, --nb-chunk NB_CHUNK
    Number of chunks

-o OUTPUT, --output OUTPUT
    The output directory where the chunks will be saved


    ]]>    </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-2-chunk-1.fasta Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,2 @@ +>derice +ACTGACTAGCTAGCTAACTG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-2-chunk-2.fasta Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,2 @@ +>sanka +GCATCGTAGCTAGCTACGAT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-2-chunk-3.fasta Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,2 @@ +>junior +CATCGATCGTACGTACGTAG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-2-chunk-4.fasta Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,2 @@ +>yul +ATCGATCGATCGTACGATCG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-3-chunk-1.fastq Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,20 @@ +@cluster_2:UMI_ATTCCG +TTTCCGGGGCACATAATCTTCAGCCGGGCGC ++ +9C;=;=<9@4868>9:67AA<9>65<=>591 +@cluster_8:UMI_CTTTGA +TATCCTTGCAATACTCTCCGAACGGGAGAGC ++ +1/04.72,(003,-2-22+00-12./.-.4- +@cluster_12:UMI_GGTCAA +GCAGTTTAAGATCATTTTATTGAAGAGCAAG ++ +?7?AEEC@>=1?A?EEEB9ECB?==:B.A?A +@cluster_21:UMI_AGAACA +GGCATTGCAAAATTTATTACACCCCCAGATC ++ +>=2.660/?:36AD;0<14703640334-// +@cluster_29:UMI_GCAGGA +CCCCCTTAAATAGCTGTTTATTTGGCCCCAG ++ +8;;;>DC@DAC=B?C@9?B?CDCB@><<??A
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-3-chunk-2.fastq Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,20 @@ +@cluster_34:UMI_AGCTCA +TCTTGCAAAAACTCCTAGATCGGAAGAGCAC ++ +-/CA:+<599803./2065?6=<>90;?150 +@cluster_36:UMI_AACAGA +TCCCCCCCCCAAATCGGAAAAACACACCCCC ++ +5?:5;<02:@977=:<0=9>@5>7>;>*3,- +@cluster_37:UMI_GAGGAG +GTCTTTGTACAAAATTTTATTAAAGGTCTTT ++ +?B?DEC@A=?ADDAEEEC?EC@D6A@@>DE4 +@cluster_39:UMI_GAACCG +CCTTCCATCACCAGATCGGAAAAACACACGC ++ +00>7;8@5<192?/8;0;;>=3=/3239713 +@cluster_43:UMI_GGATTG +GAGTTATAATCCAATCTTTATTTAAAAATCT ++ +>=AEC?C@;??0A>?0DEB9EEB@DDC1?=6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-chunk-1.fasta Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,4 @@ +>derice +ACTGACTAGCTAGCTAACTG +>sanka +GCATCGTAGCTAGCTACGAT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-chunk-1.fastq Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,8 @@ +@cluster_2:UMI_ATTCCG +TTTCCGGGGCACATAATCTTCAGCCGGGCGC ++ +9C;=;=<9@4868>9:67AA<9>65<=>591 +@cluster_8:UMI_CTTTGA +TATCCTTGCAATACTCTCCGAACGGGAGAGC ++ +1/04.72,(003,-2-22+00-12./.-.4-
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-chunk-2.fasta Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,4 @@ +>junior +CATCGATCGTACGTACGTAG +>yul +ATCGATCGATCGTACGATCG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-chunk-2.fastq Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,8 @@ +@cluster_12:UMI_GGTCAA +GCAGTTTAAGATCATTTTATTGAAGAGCAAG ++ +?7?AEEC@>=1?A?EEEB9ECB?==:B.A?A +@cluster_21:UMI_AGAACA +GGCATTGCAAAATTTATTACACCCCCAGATC ++ +>=2.660/?:36AD;0<14703640334-//
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-chunk-3.fastq Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,8 @@ +@cluster_29:UMI_GCAGGA +CCCCCTTAAATAGCTGTTTATTTGGCCCCAG ++ +8;;;>DC@DAC=B?C@9?B?CDCB@><<??A +@cluster_34:UMI_AGCTCA +TCTTGCAAAAACTCCTAGATCGGAAGAGCAC ++ +-/CA:+<599803./2065?6=<>90;?150
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-chunk-4.fastq Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,8 @@ +@cluster_36:UMI_AACAGA +TCCCCCCCCCAAATCGGAAAAACACACCCCC ++ +5?:5;<02:@977=:<0=9>@5>7>;>*3,- +@cluster_37:UMI_GAGGAG +GTCTTTGTACAAAATTTTATTAAAGGTCTTT ++ +?B?DEC@A=?ADDAEEEC?EC@D6A@@>DE4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample-chunk-5.fastq Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,8 @@ +@cluster_39:UMI_GAACCG +CCTTCCATCACCAGATCGGAAAAACACACGC ++ +00>7;8@5<192?/8;0;;>=3=/3239713 +@cluster_43:UMI_GGATTG +GAGTTATAATCCAATCTTTATTTAAAAATCT ++ +>=AEC?C@;??0A>?0DEB9EEB@DDC1?=6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample.fasta Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,8 @@ +>derice +ACTGACTAGCTAGCTAACTG +>sanka +GCATCGTAGCTAGCTACGAT +>junior +CATCGATCGTACGTACGTAG +>yul +ATCGATCGATCGTACGATCG \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sample.fastq Thu Aug 08 11:18:30 2019 -0400 @@ -0,0 +1,40 @@ +@cluster_2:UMI_ATTCCG +TTTCCGGGGCACATAATCTTCAGCCGGGCGC ++ +9C;=;=<9@4868>9:67AA<9>65<=>591 +@cluster_8:UMI_CTTTGA +TATCCTTGCAATACTCTCCGAACGGGAGAGC ++ +1/04.72,(003,-2-22+00-12./.-.4- +@cluster_12:UMI_GGTCAA +GCAGTTTAAGATCATTTTATTGAAGAGCAAG ++ +?7?AEEC@>=1?A?EEEB9ECB?==:B.A?A +@cluster_21:UMI_AGAACA +GGCATTGCAAAATTTATTACACCCCCAGATC ++ +>=2.660/?:36AD;0<14703640334-// +@cluster_29:UMI_GCAGGA +CCCCCTTAAATAGCTGTTTATTTGGCCCCAG ++ +8;;;>DC@DAC=B?C@9?B?CDCB@><<??A +@cluster_34:UMI_AGCTCA +TCTTGCAAAAACTCCTAGATCGGAAGAGCAC ++ +-/CA:+<599803./2065?6=<>90;?150 +@cluster_36:UMI_AACAGA +TCCCCCCCCCAAATCGGAAAAACACACCCCC ++ +5?:5;<02:@977=:<0=9>@5>7>;>*3,- +@cluster_37:UMI_GAGGAG +GTCTTTGTACAAAATTTTATTAAAGGTCTTT ++ +?B?DEC@A=?ADDAEEEC?EC@D6A@@>DE4 +@cluster_39:UMI_GAACCG +CCTTCCATCACCAGATCGGAAAAACACACGC ++ +00>7;8@5<192?/8;0;;>=3=/3239713 +@cluster_43:UMI_GGATTG +GAGTTATAATCCAATCTTTATTTAAAAATCT ++ +>=AEC?C@;??0A>?0DEB9EEB@DDC1?=6 \ No newline at end of file