Mercurial > repos > nml > stringmlst
comparison split_by_allele.py @ 1:4e03573653fe draft default tip
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
| author | nml |
|---|---|
| date | Tue, 19 Sep 2017 16:34:57 -0400 |
| parents | fc0f15ca12e0 |
| children | |
comparison legend: equal, deleted, inserted, replaced
| 0:fc0f15ca12e0 | 1:4e03573653fe |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 import getopt | 2 import getopt |
| 3 import os | |
| 3 import sys | 4 import sys |
| 4 import os | 5 |
| 5 from Bio import SeqIO | 6 from Bio import SeqIO |
| 6 | 7 |
| 7 def split_allele_file(alleles,profiles): | 8 ERROR_MSG = "Error could not parse out allele name and number from '%s'" |
| 8 | 9 |
| 10 | |
| 11 def split_allele_file(alleles, profiles): | |
| 12 | |
| 9 writers = {} | 13 writers = {} |
| 10 | 14 |
| 11 handle = open(alleles, "rU") | 15 handle = open(alleles, "rU") |
| 12 for record in SeqIO.parse(handle, "fasta"): | 16 for record in SeqIO.parse(handle, "fasta"): |
| 13 | 17 |
| 14 seqid=record.id | 18 seqid = record.id |
| 15 | 19 |
| 16 #split out the alelle name from the version number | 20 # split out the alelle name from the version number |
| 17 #attempting to split based on '-' first, if that fails, then '_' | 21 # attempting to split based on '-' first, if that fails, then '_' |
| 18 result = seqid.split('_') | 22 result = seqid.split('_') |
| 19 | 23 |
| 20 if len(result) !=2: | 24 if len(result) != 2: |
| 21 result = seqid.split('-') | 25 result = seqid.split('-') |
| 22 if len(result) ==2: | 26 if len(result) == 2: |
| 23 newid = '_'.join(result) | 27 newid = '_'.join(result) |
| 24 record.id = newid | 28 record.id = newid |
| 25 else: | 29 else: |
| 26 print "Error could not parse out allele name and number from '%s'" % seqid | 30 print(ERROR_MSG % seqid) |
| 27 exit(0) | 31 exit(0) |
| 28 | |
| 29 | |
| 30 name,num = result | |
| 31 | 32 |
| 33 name, num = result | |
| 32 | 34 |
| 33 #if writer exist, then write to that current fasta file | 35 # if writer exist, then write to that current fasta file |
| 34 if name in writers: | 36 if name in writers: |
| 35 SeqIO.write(record, writers[name], "fasta") | 37 SeqIO.write(record, writers[name], "fasta") |
| 36 else: | 38 else: |
| 37 #new allele found, create new writer and add the first record | 39 # new allele found, create new writer and add the first record |
| 38 file_name = name + '.fasta' | 40 file_name = name + '.fasta' |
| 39 output_fh = open(file_name, "w") | 41 output_fh = open(file_name, "w") |
| 40 SeqIO.write(record, output_fh, "fasta") | 42 SeqIO.write(record, output_fh, "fasta") |
| 41 writers[name] = output_fh | 43 writers[name] = output_fh |
| 42 | 44 |
| 43 handle.close() | 45 handle.close() |
| 44 | 46 |
| 45 #creat config file based on the alleles found | 47 # create config file based on the alleles found |
| 46 with open('config.txt','w') as cfile: | 48 with open('config.txt', 'w') as cfile: |
| 47 cfile.write("[loci]\n") | 49 cfile.write("[loci]\n") |
| 48 for name, writer in writers.iteritems() : | 50 for name, writer in writers.items(): |
| 49 path = os.path.realpath(writer.name) | 51 path = os.path.realpath(writer.name) |
| 50 cfile.write("%s\t%s\n" % (name,path)) | 52 cfile.write("%s\t%s\n" % (name, path)) |
| 51 cfile.write("[profile]\n") | 53 cfile.write("[profile]\n") |
| 52 cfile.write("profile\t%s\n" % profiles) | 54 cfile.write("profile\t%s\n" % profiles) |
| 53 | 55 |
| 54 | |
| 55 return | 56 return |
| 56 | 57 |
| 57 | 58 |
| 58 alleles=None | 59 alleles = None |
| 59 profiles=None | 60 profiles = None |
| 60 | 61 |
| 61 """Input arguments""" | 62 """Input arguments""" |
| 62 options, remainder = getopt.getopt(sys.argv[1:], '', [ | 63 options, remainder = getopt.getopt(sys.argv[1:], '', [ |
| 63 'alleles=', | 64 'alleles=', |
| 64 'profiles=' | 65 'profiles=' |
| 69 alleles = arg | 70 alleles = arg |
| 70 elif opt in ('--profiles'): | 71 elif opt in ('--profiles'): |
| 71 profiles = arg | 72 profiles = arg |
| 72 | 73 |
| 73 if alleles and profiles: | 74 if alleles and profiles: |
| 74 split_allele_file(alleles,profiles) | 75 split_allele_file(alleles, profiles) |
| 75 |
