annotate split_by_allele.py @ 0:fc0f15ca12e0 draft

planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
author nml
date Mon, 24 Oct 2016 13:15:20 -0400
parents
children 4e03573653fe
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
1 #!/usr/bin/env python
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
2 import getopt
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
3 import sys
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
4 import os
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
5 from Bio import SeqIO
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
6
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
def split_allele_file(alleles,profiles):
    """Split a multi-FASTA allele file into one FASTA file per allele name.

    Every record id must consist of an allele name and a version number
    joined by exactly one '_' or exactly one '-'. Ids that use '-' are
    rewritten to use '_' before the record is written out. Records are
    written to '<name>.fasta' (relative to the current working directory),
    and a 'config.txt' listing every allele name with the real path of its
    FASTA file, plus the given profile entry, is created afterwards.

    Args:
        alleles: path of the input FASTA file.
        profiles: value written verbatim as the 'profile' entry of the
            '[profile]' section of config.txt.

    Exits the process with status 1 (message on stderr) when a record id
    cannot be split into an allele name and a number; config.txt is not
    written in that case.
    """
    # allele name -> open output file handle for '<name>.fasta'
    writers = {}

    try:
        # Plain "r" instead of "rU": the 'U' flag is rejected by newer
        # Python 3 releases and text mode already handles newlines there.
        with open(alleles, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                seqid = record.id

                # Split the allele name from the version number.
                # Try '_' first; if that does not give exactly two parts,
                # fall back to '-'.
                result = seqid.split('_')

                if len(result) != 2:
                    result = seqid.split('-')
                    if len(result) == 2:
                        # normalise the separator to '_' in the output
                        record.id = '_'.join(result)
                    else:
                        sys.stderr.write(
                            "Error could not parse out allele name and number from '%s'\n" % seqid)
                        # Non-zero status so callers can detect the failure;
                        # the finally clause below still closes the outputs.
                        sys.exit(1)

                name = result[0]

                output_fh = writers.get(name)
                if output_fh is None:
                    # new allele found, create a new writer for it
                    output_fh = open(name + '.fasta', "w")
                    writers[name] = output_fh

                SeqIO.write(record, output_fh, "fasta")
    finally:
        # Close (and thereby flush) every per-allele file before anything
        # that reads config.txt can be pointed at them.
        for output_fh in writers.values():
            output_fh.close()

    # create the config file based on the alleles found
    with open('config.txt','w') as cfile:
        cfile.write("[loci]\n")
        for name, writer in writers.items():
            # .name remains available on a closed file object
            path = os.path.realpath(writer.name)
            cfile.write("%s\t%s\n" % (name,path))
        cfile.write("[profile]\n")
        cfile.write("profile\t%s\n" % profiles)

    return
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
56
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
57
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
alleles = None
profiles = None

# Input arguments: --alleles and --profiles, each taking a value.
options, remainder = getopt.getopt(sys.argv[1:], '', [
    'alleles=',
    'profiles='
])

for opt, arg in options:
    # Compare with ==: ('--alleles') is a plain string, not a tuple, so
    # using `in` against it would be a substring test.
    if opt == '--alleles':
        alleles = arg
    elif opt == '--profiles':
        profiles = arg

# Only run the split when both arguments were supplied.
if alleles and profiles:
    split_allele_file(alleles, profiles)
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
75