Mercurial > repos > rnateam > splitfasta
diff split_fasta.py @ 5:733ca84b21ee draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
author | rnateam |
---|---|
date | Mon, 21 Sep 2020 15:40:14 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/split_fasta.py Mon Sep 21 15:40:14 2020 +0000 @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +import os +import sys +from Bio import SeqIO + +num_chunks = 0 +if len(sys.argv) == 3: + num_chunks = int(sys.argv[2]) + input_filename = sys.argv[1] +elif len(sys.argv) == 2: + input_filename = sys.argv[1] +else: + exit("Usage: split_fasta.py <input_filename> [<num_chunks>]") + +os.mkdir('splits') + +if num_chunks != 0: + # if splitting into chunks we need to count how many records are in the + # input file + record_count = 0 + with open(input_filename) as input_file: + for line in input_file: + if line.lstrip().startswith('>'): + record_count += 1 + + records_per_chunk = round(float(record_count) / num_chunks) + +count = 1 +with open(input_filename) as input_file: + + chunk_record_count = 0 # how many lines have we written to the output file + records = [] + for record in SeqIO.parse(input_file, 'fasta'): + records.append(record) + if num_chunks == 0 or (count < num_chunks and + len(records) >= records_per_chunk): + if num_chunks == 0: + output_filename = os.path.join('splits', record.id) + else: + output_filename = os.path.join('splits', 'part{}'.format(count)) + SeqIO.write(records, output_filename, 'fasta') + count += 1 + records = [] + + if records: + # this only applies for the mode where input file is + # split into chunks + output_filename = os.path.join('splits', 'part{}'.format(count)) + SeqIO.write(records, output_filename, 'fasta')