Mercurial > repos > nml > stringmlst
annotate split_by_allele.py @ 1:4e03573653fe draft default tip
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
author | nml |
---|---|
date | Tue, 19 Sep 2017 16:34:57 -0400 |
parents | fc0f15ca12e0 |
children |
rev | line source |
---|---|
0
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
1 #!/usr/bin/env python |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
2 import getopt |
1
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
3 import os |
0
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
4 import sys |
1
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
5 |
0
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
6 from Bio import SeqIO |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
7 |
1
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
8 ERROR_MSG = "Error could not parse out allele name and number from '%s'" |
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
9 |
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
10 |
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
def split_allele_file(alleles, profiles):
    """Split a combined allele FASTA file into one FASTA file per allele name.

    Every record id must split into exactly two parts, ``<name>_<number>``
    or ``<name>-<number>``. Records are written to ``<name>.fasta`` (a path
    relative to the current working directory), and a ``config.txt`` file
    listing every generated FASTA file plus the profile file is written next
    to them.

    Args:
        alleles: path of the FASTA file holding all alleles.
        profiles: path of the profile file; copied verbatim into the
            ``[profile]`` section of ``config.txt``.

    Raises:
        SystemExit: if a record id cannot be split into a name and a number.
            The error message is written to stderr and the exit status is 1.
    """
    # allele name -> open output handle for '<name>.fasta'
    writers = {}

    try:
        with open(alleles) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                seqid = record.id

                # split out the allele name from the version number,
                # attempting to split on '_' first; if that fails, on '-'
                result = seqid.split('_')

                if len(result) != 2:
                    result = seqid.split('-')
                    if len(result) != 2:
                        # A non-zero status lets the caller detect the failure.
                        sys.exit(ERROR_MSG % seqid)
                    # normalise the id to the '<name>_<number>' form
                    record.id = '_'.join(result)

                name = result[0]

                # new allele found: create its writer before the first record
                if name not in writers:
                    writers[name] = open(name + '.fasta', "w")

                SeqIO.write(record, writers[name], "fasta")
    finally:
        # Close (and thereby flush) every per-allele file, even on failure.
        for writer in writers.values():
            writer.close()

    # create config file based on the alleles found
    with open('config.txt', 'w') as cfile:
        cfile.write("[loci]\n")
        for name, writer in writers.items():
            # .name is still readable on a closed file object
            path = os.path.realpath(writer.name)
            cfile.write("%s\t%s\n" % (name, path))
        cfile.write("[profile]\n")
        cfile.write("profile\t%s\n" % profiles)

    return
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
57 |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
58 |
1
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
# Paths supplied on the command line; both stay None until parsed below.
alleles = None
profiles = None

# Input arguments: only the long options --alleles and --profiles are
# accepted, and each one requires a value.
options, remainder = getopt.getopt(sys.argv[1:], '', [
    'alleles=',
    'profiles=',
])

for opt, arg in options:
    # Exact comparison: `opt in ('--alleles')` would be a substring test
    # against a plain string, because the parentheses do not make a tuple.
    if opt == '--alleles':
        alleles = arg
    elif opt == '--profiles':
        profiles = arg

# Only run the split when both inputs were provided; otherwise do nothing.
if alleles and profiles:
    split_allele_file(alleles, profiles)