Mercurial > repos > rnateam > splitfasta
comparison split_fasta.py @ 5:733ca84b21ee draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
author | rnateam |
---|---|
date | Mon, 21 Sep 2020 15:40:14 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:ae4d5733272f | 5:733ca84b21ee |
---|---|
#!/usr/bin/env python
"""Split a FASTA file into several files inside a new ``splits`` directory.

Usage: split_fasta.py <input_filename> [<num_chunks>]

Without ``num_chunks`` (or with ``num_chunks`` equal to 0) every record is
written to its own file, named after the record id.  With a positive
``num_chunks`` the records are written, in input order, to the files
``part1`` ... ``partN``.
"""

import os
import sys

USAGE = "Usage: split_fasta.py <input_filename> [<num_chunks>]"
OUTPUT_DIR = 'splits'


def parse_args(argv):
    """Return ``(input_filename, num_chunks)`` parsed from *argv*.

    *argv* has the same layout as ``sys.argv``.  ``num_chunks`` is 0 when it
    is not given, which selects the one-file-per-record mode.

    Exits through ``sys.exit`` with the usage message when the number of
    arguments is wrong or when ``num_chunks`` is not a non-negative integer.
    """
    if len(argv) not in (2, 3):
        sys.exit(USAGE)
    input_filename = argv[1]
    if len(argv) == 2:
        return input_filename, 0
    try:
        num_chunks = int(argv[2])
    except ValueError:
        sys.exit("{}\nnum_chunks must be an integer, got {!r}".format(
            USAGE, argv[2]))
    if num_chunks < 0:
        # A negative value used to put every record into part1 silently.
        sys.exit("{}\nnum_chunks must not be negative, got {}".format(
            USAGE, num_chunks))
    return input_filename, num_chunks


def count_records(input_filename):
    """Return the number of lines in the file that start a FASTA record.

    A line counts when its first non-whitespace character is ``>``.
    """
    with open(input_filename) as input_file:
        return sum(1 for line in input_file
                   if line.lstrip().startswith('>'))


def chunk_sizes(record_count, num_chunks):
    """Return the record count of each chunk, as evenly split as possible.

    The sizes differ by at most one and the larger chunks come first.  Empty
    chunks are left out, so the list has ``min(record_count, num_chunks)``
    entries; it is empty when either argument is not positive.
    """
    if record_count <= 0 or num_chunks <= 0:
        return []
    base, extra = divmod(record_count, num_chunks)
    sizes = [base + 1] * extra + [base] * (num_chunks - extra)
    return [size for size in sizes if size > 0]


def group_records(records, sizes):
    """Yield consecutive lists of items from *records*, sized by *sizes*.

    The final group is not cut at its nominal size: it receives every
    remaining item, so nothing is dropped when *records* yields more items
    than ``sum(sizes)``.  No empty group is ever yielded.
    """
    sizes = list(sizes)
    last = len(sizes) - 1
    index = 0
    group = []
    for record in records:
        group.append(record)
        if index < last and len(group) >= sizes[index]:
            yield group
            group = []
            index += 1
    if group:
        yield group


def safe_filename(record_id):
    """Return *record_id* with path separators replaced by ``_``.

    Keeps the output file directly inside the output directory even when the
    id contains ``/`` (or the platform's other separator).
    """
    for separator in (os.sep, os.altsep):
        if separator:
            record_id = record_id.replace(separator, '_')
    return record_id


def split_fasta(input_filename, num_chunks):
    """Write the records of *input_filename* into the ``splits`` directory.

    With ``num_chunks == 0`` each record goes to ``splits/<record id>``;
    otherwise the records go to ``splits/part1`` ... in input order, with the
    sizes given by :func:`chunk_sizes`.

    Raises ``OSError`` when ``splits`` already exists or the input file
    cannot be opened.
    """
    # Imported here so the helpers above stay importable without Biopython;
    # it still happens before anything is created on disk.
    from Bio import SeqIO

    # os.mkdir raises if the directory exists, so output from an earlier
    # run is never mixed with the new files.
    os.mkdir(OUTPUT_DIR)

    if num_chunks == 0:
        with open(input_filename) as input_file:
            for record in SeqIO.parse(input_file, 'fasta'):
                output_filename = os.path.join(
                    OUTPUT_DIR, safe_filename(record.id))
                SeqIO.write([record], output_filename, 'fasta')
        return

    # The chunk sizes must be known up front, which needs a first pass over
    # the file to count the records.
    sizes = chunk_sizes(count_records(input_filename), num_chunks)
    with open(input_filename) as input_file:
        groups = group_records(SeqIO.parse(input_file, 'fasta'), sizes)
        for number, group in enumerate(groups, 1):
            output_filename = os.path.join(
                OUTPUT_DIR, 'part{}'.format(number))
            SeqIO.write(group, output_filename, 'fasta')


def main(argv=None):
    """Run the script with *argv* (defaults to ``sys.argv``)."""
    input_filename, num_chunks = parse_args(
        sys.argv if argv is None else argv)
    split_fasta(input_filename, num_chunks)


if __name__ == '__main__':
    main()