annotate split_by_allele.py @ 1:4e03573653fe draft default tip

planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
author nml
date Tue, 19 Sep 2017 16:34:57 -0400
parents fc0f15ca12e0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
1 #!/usr/bin/env python
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
2 import getopt
1
4e03573653fe planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents: 0
diff changeset
3 import os
0
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
4 import sys
1
4e03573653fe planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents: 0
diff changeset
5
0
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
6 from Bio import SeqIO
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
7
1
4e03573653fe planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents: 0
diff changeset
8 ERROR_MSG = "Error could not parse out allele name and number from '%s'"
4e03573653fe planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents: 0
diff changeset
9
4e03573653fe planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents: 0
diff changeset
10
4e03573653fe planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents: 0
diff changeset
def split_allele_file(alleles, profiles):
    """Split a combined allele FASTA into one FASTA file per allele name.

    Each record id is expected to look like ``<name>_<number>`` or
    ``<name>-<number>``. Records are appended to ``<name>.fasta`` (created
    relative to the current working directory), and ids that used '-' are
    rewritten to use '_' before being written out. Afterwards a
    ``config.txt`` file is written with a ``[loci]`` section listing every
    allele name with the real path of its FASTA file, followed by a
    ``[profile]`` section pointing at ``profiles``.

    Args:
        alleles: Path of the input FASTA file holding all alleles.
        profiles: Value written verbatim after ``profile`` in the
            ``[profile]`` section of ``config.txt``.

    Raises:
        SystemExit: With status 1 if a record id cannot be split into
            exactly two parts on either '_' or '-'. The message is written
            to stderr so the failure is not mistaken for a successful run.
    """
    # Maps allele name -> open output file handle for that allele's FASTA.
    writers = {}

    try:
        # Plain "r" mode: the "U" flag was removed in Python 3.11, and text
        # mode already performs universal-newline handling on Python 3.
        with open(alleles, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                seqid = record.id

                # Split out the allele name from the version number,
                # attempting to split on '_' first; if that fails, then '-'.
                result = seqid.split('_')
                if len(result) != 2:
                    result = seqid.split('-')
                    if len(result) != 2:
                        sys.stderr.write((ERROR_MSG % seqid) + "\n")
                        sys.exit(1)
                    # Normalise '-' separated ids to the '_' form.
                    record.id = '_'.join(result)

                name = result[0]

                # First record for this allele: create its output file.
                if name not in writers:
                    writers[name] = open(name + '.fasta', "w")

                SeqIO.write(record, writers[name], "fasta")
    finally:
        # Close (and thereby flush) every per-allele file, including on the
        # error path, so their contents are on disk before config.txt
        # references them.
        for output_fh in writers.values():
            output_fh.close()

    # Create the config file based on the alleles found. A file object's
    # ``name`` attribute remains available after it has been closed.
    with open('config.txt', 'w') as cfile:
        cfile.write("[loci]\n")
        for name, writer in writers.items():
            path = os.path.realpath(writer.name)
            cfile.write("%s\t%s\n" % (name, path))
        cfile.write("[profile]\n")
        cfile.write("profile\t%s\n" % profiles)

    return
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
57
fc0f15ca12e0 planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff changeset
58
1
4e03573653fe planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents: 0
diff changeset
alleles = None
profiles = None

# Input arguments: only the long options --alleles and --profiles are
# accepted, and each one requires a value.
options, remainder = getopt.getopt(sys.argv[1:], '', [
    'alleles=',
    'profiles='
])

for opt, arg in options:
    # Compare with ==. Writing ``opt in ('--alleles')`` tests for a
    # substring of a plain string, because the parentheses do not make a
    # tuple.
    if opt == '--alleles':
        alleles = arg
    elif opt == '--profiles':
        profiles = arg

# Only run the split when both values were supplied (and are non-empty);
# otherwise the script does nothing.
if alleles and profiles:
    split_allele_file(alleles, profiles)