Mercurial > repos > rnateam > splitfasta
annotate split_fasta.py @ 5:733ca84b21ee draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
author | rnateam |
---|---|
date | Mon, 21 Sep 2020 15:40:14 +0000 |
parents | |
children |
rev | line source |
---|---|
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
#!/usr/bin/env python
"""Split a FASTA file into one file per record or into a number of chunks.

Usage: split_fasta.py <input_filename> [<num_chunks>]

Output files are written to a ``splits`` directory that is created in the
current working directory.

* Without ``num_chunks`` (or with ``num_chunks`` equal to 0) every record is
  written to its own file, named after the record id.
* With a positive ``num_chunks`` the records are written, in input order, to
  files named ``part1``, ``part2``, ...
"""

import os
import sys

USAGE = "Usage: split_fasta.py <input_filename> [<num_chunks>]"
OUTPUT_DIR = 'splits'


def parse_args(argv):
    """Return ``(input_filename, num_chunks)`` parsed from *argv*.

    *argv* has the shape of ``sys.argv`` (program name first).  ``num_chunks``
    is 0 when it is not given, which selects one-file-per-record mode.

    Exits with a message on stderr (via ``SystemExit``) when the number of
    arguments is wrong or ``num_chunks`` is not a non-negative integer.
    """
    if len(argv) == 2:
        return argv[1], 0
    if len(argv) != 3:
        sys.exit(USAGE)

    try:
        num_chunks = int(argv[2])
    except ValueError:
        sys.exit("num_chunks must be an integer, got {!r}\n{}".format(
            argv[2], USAGE))
    # A negative value would make the "count < num_chunks" test in
    # split_fasta() permanently false and silently put every record in part1.
    if num_chunks < 0:
        sys.exit("num_chunks must not be negative, got {}\n{}".format(
            num_chunks, USAGE))
    return argv[1], num_chunks


def count_records(input_filename):
    """Return the number of lines in *input_filename* that start a record.

    A line counts when its first non-whitespace character is ``>``.
    """
    with open(input_filename) as input_file:
        return sum(1 for line in input_file if line.lstrip().startswith('>'))


def records_per_chunk(record_count, num_chunks):
    """Return the target number of records for every chunk but the last.

    *num_chunks* must be non-zero.  The value is the rounded quotient, so the
    final chunk written by split_fasta() absorbs whatever remains.
    """
    return round(float(record_count) / num_chunks)


def safe_record_filename(record_id):
    """Return *record_id* made usable as a single file name.

    Path separators are replaced by ``_`` and the names ``''``, ``'.'`` and
    ``'..'`` get a ``_`` prefix, so the result always names a file directly
    inside the output directory.  Ids without such characters are unchanged.
    """
    name = record_id
    for separator in {'/', os.sep, os.altsep} - {None}:
        name = name.replace(separator, '_')
    if name in ('', '.', '..'):
        name = '_' + name
    return name


def split_fasta(input_filename, num_chunks=0, output_dir=OUTPUT_DIR):
    """Write the records of *input_filename* to files below *output_dir*.

    *output_dir* is created with ``os.mkdir`` and must not exist yet.  With
    ``num_chunks == 0`` each record goes to a file named after its sanitised
    id; otherwise records are grouped into ``part<N>`` files.
    """
    # Imported here rather than at module level so that usage errors are
    # reported without needing Biopython to be importable.
    from Bio import SeqIO

    os.mkdir(output_dir)

    chunk_size = 0
    if num_chunks != 0:
        # The number of records must be known up front to size the chunks.
        chunk_size = records_per_chunk(count_records(input_filename),
                                       num_chunks)

    count = 1  # 1-based index of the next output file
    records = []  # records collected for the output file being assembled
    with open(input_filename) as input_file:
        for record in SeqIO.parse(input_file, 'fasta'):
            records.append(record)
            if num_chunks == 0:
                # NOTE(review): records sharing an id presumably end up
                # overwriting the same file -- confirm if that matters.
                output_name = safe_record_filename(record.id)
            elif count < num_chunks and len(records) >= chunk_size:
                output_name = 'part{}'.format(count)
            else:
                # Keep collecting; the last chunk is flushed after the loop.
                continue
            SeqIO.write(records, os.path.join(output_dir, output_name),
                        'fasta')
            count += 1
            records = []

    if records:
        # Only reached in chunk mode: the final chunk takes the remainder.
        SeqIO.write(records,
                    os.path.join(output_dir, 'part{}'.format(count)),
                    'fasta')


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv``."""
    if argv is None:
        argv = sys.argv
    input_filename, num_chunks = parse_args(argv)
    split_fasta(input_filename, num_chunks)


if __name__ == '__main__':
    main()