Mercurial > repos > rnateam > splitfasta

diff split_fasta.py @ 5:733ca84b21ee draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
author: rnateam
date: Mon, 21 Sep 2020 15:40:14 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/split_fasta.py	Mon Sep 21 15:40:14 2020 +0000
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+"""Split a FASTA file into smaller files in a new 'splits' directory.
+
+Usage: split_fasta.py <input_filename> [<num_chunks>]
+
+Without <num_chunks> (or with 0) every record is written to its own file,
+named after the record id. With a positive <num_chunks> the records are
+written to files named part1, part2, ...; the last file receives whatever
+records are left over.
+"""
+
+import os
+import sys
+from Bio import SeqIO
+
+USAGE = "Usage: split_fasta.py <input_filename> [<num_chunks>]"
+
+
+def safe_name(record_id):
+    """Return record_id with path separators replaced by '_'.
+
+    A separator inside the id would otherwise make the output path point
+    into a subdirectory of (or outside) 'splits'.
+    """
+    for sep in (os.sep, os.altsep):
+        if sep:
+            record_id = record_id.replace(sep, '_')
+    return record_id
+
+
+num_chunks = 0
+if len(sys.argv) == 3:
+    input_filename = sys.argv[1]
+    try:
+        num_chunks = int(sys.argv[2])
+    except ValueError:
+        sys.exit(USAGE)
+    if num_chunks < 0:
+        sys.exit(USAGE)
+elif len(sys.argv) == 2:
+    input_filename = sys.argv[1]
+else:
+    sys.exit(USAGE)
+
+# os.mkdir raises if 'splits' already exists, so output from an earlier run
+# is never mixed with new output.
+os.mkdir('splits')
+
+if num_chunks != 0:
+    # if splitting into chunks we need to count how many records are in the
+    # input file
+    record_count = 0
+    with open(input_filename) as input_file:
+        for line in input_file:
+            if line.lstrip().startswith('>'):
+                record_count += 1
+
+    # NOTE(review): because of the rounding the last file absorbs any
+    # remainder, and fewer than num_chunks files can be written (e.g. 15
+    # records into 10 chunks gives 8 files on Python 3).
+    records_per_chunk = round(float(record_count) / num_chunks)
+
+count = 1  # number of the next output file
+with open(input_filename) as input_file:
+    records = []  # records collected for the current output file
+    for record in SeqIO.parse(input_file, 'fasta'):
+        records.append(record)
+        if num_chunks == 0:
+            # one file per record, named after the record id
+            output_filename = os.path.join('splits', safe_name(record.id))
+        elif count < num_chunks and len(records) >= records_per_chunk:
+            output_filename = os.path.join('splits', 'part{}'.format(count))
+        else:
+            # current chunk is not full yet, or this is the last chunk,
+            # which is written after the loop
+            continue
+        SeqIO.write(records, output_filename, 'fasta')
+        count += 1
+        records = []
+
+    if records:
+        # this only applies for the mode where input file is
+        # split into chunks
+        output_filename = os.path.join('splits', 'part{}'.format(count))
+        SeqIO.write(records, output_filename, 'fasta')
author	rnateam
date	Mon, 21 Sep 2020 15:40:14 +0000
parents
children