annotate split_fasta.py @ 5:733ca84b21ee draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
author rnateam
date Mon, 21 Sep 2020 15:40:14 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
2
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
3 import os
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
4 import sys
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
5 from Bio import SeqIO
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
6
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
USAGE = "Usage: split_fasta.py <input_filename> [<num_chunks>]"
OUTPUT_DIR = 'splits'


def chunk_sizes(record_count, num_chunks):
    """Return the number of records each chunk should receive.

    Records are spread as evenly as possible: the first
    ``record_count % num_chunks`` chunks get one extra record.  Chunks that
    would be empty (more chunks requested than records available) are
    dropped, so the result may be shorter than ``num_chunks``.

    Raises:
        ValueError: if ``num_chunks`` is not a positive integer.
    """
    if num_chunks <= 0:
        raise ValueError("num_chunks must be a positive integer")
    base, extra = divmod(record_count, num_chunks)
    sizes = [base + 1] * extra + [base] * (num_chunks - extra)
    return [size for size in sizes if size > 0]


def count_records(input_filename):
    """Count the header lines (first non-blank character is '>') in a file."""
    with open(input_filename) as input_file:
        return sum(1 for line in input_file if line.lstrip().startswith('>'))


def safe_name(record_id):
    """Return ``record_id`` with path separators replaced by underscores.

    Record ids come straight from the input file; without this an id such as
    ``a/b`` or ``/tmp/x`` would be written outside the output directory.
    """
    name = record_id.replace(os.sep, '_')
    if os.altsep:
        name = name.replace(os.altsep, '_')
    return name


def split_fasta(input_filename, num_chunks=0, output_dir=OUTPUT_DIR):
    """Split a FASTA file into several files inside ``output_dir``.

    Args:
        input_filename: path of the FASTA file to split.
        num_chunks: 0 writes one file per record, named after the record id;
            a positive value writes files ``part1`` .. ``partN`` holding
            evenly sized groups of consecutive records.
        output_dir: existing directory that receives the output files.
    """
    # The per-chunk sizes are only needed (and the extra counting pass over
    # the input only made) when splitting into a fixed number of chunks.
    sizes = []
    if num_chunks != 0:
        sizes = chunk_sizes(count_records(input_filename), num_chunks)

    part = 1
    records = []
    with open(input_filename) as input_file:
        for record in SeqIO.parse(input_file, 'fasta'):
            if num_chunks == 0:
                output_filename = os.path.join(output_dir,
                                               safe_name(record.id))
                SeqIO.write([record], output_filename, 'fasta')
                continue

            records.append(record)
            # The final chunk is never closed here: it collects whatever is
            # left, so a header count that disagrees with the parser cannot
            # drop records or create more files than requested.
            if part < len(sizes) and len(records) >= sizes[part - 1]:
                output_filename = os.path.join(output_dir,
                                               'part{}'.format(part))
                SeqIO.write(records, output_filename, 'fasta')
                part += 1
                records = []

    if records:
        # Only reached in chunk mode: flush the last (remainder) chunk.
        output_filename = os.path.join(output_dir, 'part{}'.format(part))
        SeqIO.write(records, output_filename, 'fasta')


def main(argv=None):
    """Parse the command line, create the output directory and split."""
    argv = sys.argv if argv is None else argv

    num_chunks = 0
    if len(argv) == 3:
        try:
            num_chunks = int(argv[2])
        except ValueError:
            sys.exit(USAGE)
        input_filename = argv[1]
    elif len(argv) == 2:
        input_filename = argv[1]
    else:
        sys.exit(USAGE)

    if num_chunks < 0:
        sys.exit("num_chunks must be a non-negative integer")

    # Deliberately fails if the directory already exists, so output from a
    # previous run is never mixed with the new files.
    os.mkdir(OUTPUT_DIR)
    split_fasta(input_filename, num_chunks, OUTPUT_DIR)


if __name__ == "__main__":
    main()