diff split_by_allele.py @ 0:fc0f15ca12e0 draft

planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
author nml
date Mon, 24 Oct 2016 13:15:20 -0400
parents
children 4e03573653fe
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/split_by_allele.py	Mon Oct 24 13:15:20 2016 -0400
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+import getopt
+import sys
+import os
+from Bio import SeqIO
+
+def split_allele_file(alleles,profiles):
+    """Split a multi-FASTA allele file into one FASTA file per locus.
+
+    Each record id must look like '<locus>_<number>' or '<locus>-<number>';
+    the '-' form is rewritten to '<locus>_<number>' before writing. Records
+    are written to '<locus>.fasta' (relative to the working directory), and
+    'config.txt' is then written listing every locus file and the profiles.
+
+    alleles  -- path to the input FASTA file
+    profiles -- value recorded under the [profile] section of config.txt
+
+    Exits with status 1 and a message on stderr when a record id does not
+    split into exactly two parts (locus name and allele number).
+    """
+    writers = {}  # locus name -> open handle of its '<locus>.fasta' file
+    try:
+        # "rU" is the universal-newline read mode of Python 2
+        with open(alleles, "rU") as handle:
+            for record in SeqIO.parse(handle, "fasta"):
+                seqid = record.id
+                # split the allele name from the version number: try '_'
+                # first, and fall back to '-' only when that does not
+                # yield exactly two parts
+                result = seqid.split('_')
+                if len(result) != 2:
+                    result = seqid.split('-')
+                    if len(result) != 2:
+                        sys.exit("Error could not parse out allele name and"
+                                 " number from '%s'" % seqid)
+                    # normalise '<locus>-<number>' ids to use '_'
+                    record.id = '_'.join(result)
+                name = result[0]
+                # the first record of a locus opens that locus' output file
+                if name not in writers:
+                    writers[name] = open(name + '.fasta', "w")
+                SeqIO.write(record, writers[name], "fasta")
+
+        # create the config file based on the alleles found
+        with open('config.txt', 'w') as cfile:
+            cfile.write("[loci]\n")
+            for name, writer in writers.items():
+                path = os.path.realpath(writer.name)
+                cfile.write("%s\t%s\n" % (name, path))
+            cfile.write("[profile]\n")
+            cfile.write("profile\t%s\n" % profiles)
+    finally:
+        # always close the per-locus files, even when exiting on an error
+        for writer in writers.values():
+            writer.close()
+
+
+# Command-line entry point: --alleles and --profiles are both required.
+alleles = None
+profiles = None
+
+options, remainder = getopt.getopt(sys.argv[1:], '',
+                                   ['alleles=', 'profiles='])
+
+for opt, arg in options:
+    # '==' on purpose: "opt in ('--alleles')" is a substring test on a str
+    if opt == '--alleles':
+        alleles = arg
+    elif opt == '--profiles':
+        profiles = arg
+
+if not (alleles and profiles):
+    sys.exit("Usage: split_by_allele.py --alleles FASTA --profiles FILE")
+split_allele_file(alleles, profiles)
+