Mercurial > repos > nml > stringmlst
annotate split_by_allele.py @ 0:fc0f15ca12e0 draft
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
author | nml |
---|---|
date | Mon, 24 Oct 2016 13:15:20 -0400 |
parents | |
children | 4e03573653fe |
rev | line source |
---|---|
0
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
1 #!/usr/bin/env python |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
2 import getopt |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
3 import sys |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
4 import os |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
5 from Bio import SeqIO |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
6 |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
def split_allele_file(alleles,profiles):
    """Split a combined allele FASTA file into one FASTA file per allele name.

    Every record id in ``alleles`` must have the form ``<name>_<number>`` or
    ``<name>-<number>``. Ids using '-' are rewritten to use '_' before the
    record is written out. Each record is appended to ``<name>.fasta`` in the
    current working directory.

    After all records are written, ``config.txt`` is created in the current
    working directory with a ``[loci]`` section (one ``name<TAB>path`` line per
    allele file, using the resolved real path) and a ``[profile]`` section
    containing ``profiles`` verbatim.

    Parameters:
        alleles:  path of the FASTA file to split.
        profiles: value written after ``profile<TAB>`` in ``config.txt``.

    Raises:
        SystemExit: with status 1 if a record id cannot be split into exactly
            two parts on either '_' or '-'. No ``config.txt`` is written in
            that case.
    """
    # "rU" (universal newlines) is only meaningful on Python 2 and was removed
    # from open() in Python 3.11; plain "r" already translates newlines there.
    mode = "rU" if sys.version_info[0] < 3 else "r"

    # allele name -> open output file handle
    writers = {}

    try:
        with open(alleles, mode) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                seqid = record.id

                # split out the allele name from the version number
                # attempting to split based on '_' first, if that fails, then '-'
                result = seqid.split('_')
                if len(result) != 2:
                    result = seqid.split('-')
                    if len(result) != 2:
                        sys.stderr.write(
                            "Error could not parse out allele name and number from '%s'\n" % seqid)
                        # non-zero status so the caller can detect the failure
                        sys.exit(1)
                    # normalise '<name>-<number>' ids to '<name>_<number>'
                    record.id = '_'.join(result)

                name = result[0]

                # reuse the writer for this allele, or create it on first sight
                output_fh = writers.get(name)
                if output_fh is None:
                    output_fh = open(name + '.fasta', "w")
                    writers[name] = output_fh
                SeqIO.write(record, output_fh, "fasta")
    finally:
        # flush and release every per-allele file, including on the error path
        for output_fh in writers.values():
            output_fh.close()

    # create config file based on the alleles found
    with open('config.txt','w') as cfile:
        cfile.write("[loci]\n")
        for name, writer in writers.items():
            # .name is still available on a closed file object
            path = os.path.realpath(writer.name)
            cfile.write("%s\t%s\n" % (name,path))
        cfile.write("[profile]\n")
        cfile.write("profile\t%s\n" % profiles)

    return
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
56 |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
57 |
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
alleles=None
profiles=None

# Input arguments: both --alleles=<path> and --profiles=<value> must be given
# for any work to be done.
options, remainder = getopt.getopt(sys.argv[1:], '', [
    'alleles=',
    'profiles='
])

for opt, arg in options:
    # Compare with ==: ('--alleles') is a plain string, not a tuple, so the
    # former `in` check was an accidental substring test.
    if opt == '--alleles':
        alleles = arg
    elif opt == '--profiles':
        profiles = arg

# Nothing is done unless both options were supplied with non-empty values.
if alleles and profiles:
    split_allele_file(alleles,profiles)
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
75 |