Mercurial > repos > rnateam > splitfasta
changeset 6:7521d865e770 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
author | bgruening |
---|---|
date | Tue, 14 Jan 2025 21:52:36 +0000 |
parents | 733ca84b21ee |
children | |
files | splitFasta.xml split_fasta.py |
diffstat | 2 files changed, 36 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/splitFasta.xml Mon Sep 21 15:40:14 2020 +0000
+++ b/splitFasta.xml Tue Jan 14 21:52:36 2025 +0000
@@ -1,4 +1,4 @@
-<tool id="rbc_splitfasta" name="Split Fasta" version="0.4.0">
+<tool id="rbc_splitfasta" name="Split Fasta" version="0.5.0" profile="23.0">
     <description>files into a collection</description>
     <requirements>
         <requirement type="package" version="1.76">biopython</requirement>
@@ -6,9 +6,9 @@
     <command detect_errors="aggressive">
         <![CDATA[
         #if $splitmode.splitmode_select == "each":
-            python $__tool_directory__/split_fasta.py '$inputFile'
+            python $__tool_directory__/split_fasta.py --records '$inputFile.metadata.sequences' --limit "\${GALAXY_FILE_LIMIT:-0}" '$inputFile'
         #else if $splitmode.splitmode_select == "chunks":
-            python $__tool_directory__/split_fasta.py '$inputFile' $splitmode.num_chunks
+            python $__tool_directory__/split_fasta.py --records '$inputFile.metadata.sequences' --limit "\${GALAXY_FILE_LIMIT:-0}" --num-chunks '$splitmode.num_chunks' '$inputFile'
         #end if
         ]]></command>
     <inputs>
--- a/split_fasta.py Mon Sep 21 15:40:14 2020 +0000
+++ b/split_fasta.py Tue Jan 14 21:52:36 2025 +0000
@@ -1,50 +1,61 @@
 #!/usr/bin/env python
 
+import argparse
 import os
-import sys
+
 from Bio import SeqIO
 
-num_chunks = 0
-if len(sys.argv) == 3:
-    num_chunks = int(sys.argv[2])
-    input_filename = sys.argv[1]
-elif len(sys.argv) == 2:
-    input_filename = sys.argv[1]
-else:
-    exit("Usage: split_fasta.py <input_filename> [<num_chunks>]")
+parser = argparse.ArgumentParser()
+parser.add_argument("--records", type=int, default=None)
+parser.add_argument("--limit", type=int, default=None)
+parser.add_argument("--num-chunks", type=int, default=0)
+parser.add_argument("input_file")
+args = parser.parse_args()
 
-os.mkdir('splits')
+input_filename = args.input_file
+num_chunks = args.num_chunks
+record_count = args.records
+record_limit = args.limit
 
-if num_chunks != 0:
-    # if splitting into chunks we need to count how many records are in the
-    # input file
+os.mkdir("splits")
+
+if record_limit and num_chunks > record_limit:
+    exit(f"ERROR: Requested number of chunks {num_chunks} exceeds limit {record_limit}")
+
+if not record_count and (num_chunks != 0 or record_limit):
+    # if no count is provided and if splitting into chunks or a limit is set, we need to count how many records are in the input file
     record_count = 0
     with open(input_filename) as input_file:
         for line in input_file:
-            if line.lstrip().startswith('>'):
+            if line.lstrip().startswith(">"):
                 record_count += 1
 
+if num_chunks != 0:
     records_per_chunk = round(float(record_count) / num_chunks)
 
+if record_limit and record_count > record_limit:
+    exit(f"ERROR: Number of sequences {record_count} exceeds limit {record_limit}")
+
 count = 1
 with open(input_filename) as input_file:
 
     chunk_record_count = 0  # how many lines have we written to the output file
     records = []
-    for record in SeqIO.parse(input_file, 'fasta'):
+    for record in SeqIO.parse(input_file, "fasta"):
         records.append(record)
-        if num_chunks == 0 or (count < num_chunks and
-                               len(records) >= records_per_chunk):
+        if num_chunks == 0 or (
+            count < num_chunks and len(records) >= records_per_chunk
+        ):
             if num_chunks == 0:
-                output_filename = os.path.join('splits', record.id)
+                output_filename = os.path.join("splits", record.id)
             else:
-                output_filename = os.path.join('splits', 'part{}'.format(count))
-            SeqIO.write(records, output_filename, 'fasta')
+                output_filename = os.path.join("splits", "part{}".format(count))
+            SeqIO.write(records, output_filename, "fasta")
             count += 1
             records = []
 
     if records:
         # this only applies for the mode where input file is
         # split into chunks
-        output_filename = os.path.join('splits', 'part{}'.format(count))
-        SeqIO.write(records, output_filename, 'fasta')
+        output_filename = os.path.join("splits", "part{}".format(count))
+        SeqIO.write(records, output_filename, "fasta")