annotate data_manager/fetch_mlst_data.py @ 0:25d4d9f313a0 draft default tip

Uploaded
author ulfschaefer
date Wed, 13 Jul 2016 05:50:48 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
1 #!/usr/bin/env python
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
2
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
3 '''
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
4 Download MLST datasets from this site: http://pubmlst.org/data/ by
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
5 parsing an xml file (http://pubmlst.org/data/dbases.xml).
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
6
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
7 Data is downloaded for a species determined by the user:
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
8 - profiles (maps STs to allele numbers)
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
9 - numbered sequences for each locus in the scheme
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
10
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
11 In addition, the alleles are concatenated together for use with SRST2.
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
12
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
13 A log file is also generated in the working directory, detailing the
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
14 time, date and location of all files downloaded, as well as the <retrieved>
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
15 tag which tells us when the XML entry was last updated.
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
16
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
17 If the species name input by the user matches multiple <species> in the
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
18 xml file, the script simply reports the possible matches so the user can
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
19 try again.
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
20 '''
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
21
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
22 """
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
23 - Remove empty line at the end of profiles.txt file.
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
24 - Ensure the allele names at the profiles.txt file don't contain "_".
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
25
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
26 """
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
27 from argparse import ArgumentParser
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
28 import xml.dom.minidom as xml
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
29 import urllib2 as url
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
30 import re
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
31 import os
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
32 import sys
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
33 import glob
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
34 import csv
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
35 import shutil
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
36 from urlparse import urlparse
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
37 import time
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
38 import subprocess
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
39 from json import dumps
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
40 from json import loads
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
41
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
42 # --------------------------------------------------------------------------------------------------
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
43
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
def parse_args():
    """Parse the command-line options of the MLST download script.

    Options are read from ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``repository_url``,
        ``species``, ``outfile`` and ``reference``. ``species``, ``outfile``
        and ``reference`` are mandatory; ``repository_url`` has a default.
    """
    # The description is split over two adjacent literals. The first one must
    # end with a space, otherwise the help text reads "speciesfrom".
    parser = ArgumentParser(description='Download MLST datasets by species '
                                        'from pubmlst.org.')

    parser.add_argument('--repository_url',
                        metavar='URL',
                        default='http://pubmlst.org/data/dbases.xml',
                        help='URL for MLST repository XML index')

    parser.add_argument('--species',
                        metavar='NAME',
                        required=True,
                        help='The name of the species that you want to download (e.g. "Escherichia coli")')

    parser.add_argument('--outfile',
                        metavar='FILE',
                        required=True,
                        help='The name of the Json file to write that galaxy stuff to.')

    parser.add_argument('--reference',
                        metavar='ACCESSION',
                        required=True,
                        help='NCBI accession number of the reference genome to use for flanking regions.')

    return parser.parse_args()
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
69
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
70 # --------------------------------------------------------------------------------------------------
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
71
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
def main():
    """Download the MLST scheme of one species and register it with Galaxy.

    Steps, in order:
      1. Fetch and parse the repository XML index (``--repository_url``).
      2. Find the single ``<species>`` entry whose name starts with
         ``--species``. The script exits with status 1 when nothing matches
         and with status 2 (after listing the candidates on stderr) when
         several entries match.
      3. Read the JSON parameters from ``--outfile`` to obtain the
         ``extra_files_path`` and create
         ``<extra_files_path>/<species>/<timestamp>`` below it.
      4. Download ``profiles.txt`` and one FASTA file per locus into it.
      5. Download the reference genome and build a BLAST database from it.
      6. Overwrite ``--outfile`` with the ``mlst_data`` data table entry.

    The index is expected to contain entries shaped like this::

        <species>
        Achromobacter spp.
        <mlst>
        <database>
        <url>http://pubmlst.org/achromobacter</url>
        <retrieved>2015-08-11</retrieved>
        <profiles>
        <count>272</count>
        <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
        </profiles>
        <loci>
        <locus>
        nusA
        <url>
        http://pubmlst.org/data/alleles/achromobacter/nusA.tfa
        </url>
        </locus>
        <locus>
        rpoB
        <url>
        http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa
        </url>
        </locus>
    """
    args = parse_args()

    # args.repository_url defaults to http://pubmlst.org/data/dbases.xml
    docFile = url.urlopen(args.repository_url)
    try:
        doc = xml.parse(docFile)
    finally:
        docFile.close()
    # documentElement is the root element even when the document starts with
    # a comment or processing instruction (childNodes[0] would not be).
    root = doc.documentElement

    # A bare name for these species is rewritten to its "#1" entry.
    # NOTE(review): presumably the index lists several numbered schemes for
    # them, so the bare name would be ambiguous -- confirm against the index.
    scheme_aliases = {
        "Escherichia coli": "Escherichia coli#1",
        "Acinetobacter baumannii": "Acinetobacter baumannii#1",
        "Pasteurella multocida": "Pasteurella multocida#1",
    }
    args.species = scheme_aliases.get(args.species, args.species)

    found_species = []
    for species_node in root.getElementsByTagName('species'):
        info = getSpeciesInfo(species_node, args.species)
        if info is not None:
            found_species.append(info)

    if not found_species:
        sys.stderr.write("No species matched your query.\n")
        sys.exit(1)

    if len(found_species) > 1:
        sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species)))
        for info in found_species:
            sys.stderr.write(info.name + '\n')
        sys.exit(2)

    # output information for the single matching species
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    timestamp = time.strftime("%Y%m%d%H%M%S")

    # --outfile is first read as input: it holds the JSON parameters that
    # name the directory in which the downloaded files must be stored.
    with open(args.outfile) as fparams:
        params = loads(fparams.read())
    folder = os.path.join(params['output_data'][0]['extra_files_path'], species_name_underscores, timestamp)

    if not os.path.isdir(folder):
        os.makedirs(folder)

    profiles_path = os.path.join(folder, 'profiles.txt')
    profile_doc = url.urlopen(species_info.profiles_url)
    try:
        with open(profiles_path, 'w') as f:
            sys.stdout.write("Writing to %s\n" % (profiles_path))
            for line in profile_doc.readlines():
                # Strip the line ending before splitting: otherwise a row with
                # eight or fewer columns keeps its own newline in the last
                # column and ends up followed by an empty line.
                line = line.rstrip("\r\n")
                if not line:
                    continue
                cols = line.split("\t")
                # Only the first eight columns are kept.
                # NOTE(review): presumably ST plus seven loci; a scheme with
                # more loci would be truncated here -- confirm.
                f.write("%s\n" % ('\t'.join(cols[0:8])))
    finally:
        profile_doc.close()

    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        locus_filename = locus_filename.replace("_.tfa", ".fas")
        locus_filename = locus_filename.replace("tfa", "fas")
        locus_target = os.path.join(folder, locus_filename)

        locus_doc = url.urlopen(locus.url)
        try:
            locus_fasta_content = locus_doc.read()
        finally:
            locus_doc.close()

        # Underscores are turned into single dashes throughout the FASTA text.
        locus_fasta_content = locus_fasta_content.replace("_", "-").replace("--", "-")
        sys.stdout.write("Writing to %s\n" % (locus_target))
        with open(locus_target, 'w') as locus_file:
            locus_file.write(locus_fasta_content)

    get_reference(folder, args.reference)

    # do Galaxy stuff
    name = "%s-%s" % (species_info.name, timestamp)
    data_manager_dict = {
        'data_tables': {
            'mlst_data': [dict(value=species_name_underscores,
                               dbkey=species_name_underscores,
                               name=name,
                               time_stamp=timestamp,
                               file_path=folder)],
        },
    }

    # save info to json file
    with open(args.outfile, 'w') as fjson:
        fjson.write(dumps(data_manager_dict))
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
179
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
180 # end of main --------------------------------------------------------------------------------------
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
181
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
def get_reference(folder, acc):
    """Download the reference genome and build a nucleotide BLAST db from it.

    The sequence for accession ``acc`` is fetched in FASTA format from the
    TogoWS service and stored as ``<folder>/reference.seq``. ``makeblastdb``
    is then run on that file with the output prefix ``<folder>/reference``.

    Args:
        folder: existing directory that receives the downloaded file.
        acc: accession inserted into the TogoWS ``ncbi-nucleotide`` URL.

    Returns:
        None. A ``makeblastdb`` run that cannot be started or that exits with
        a non-zero status is reported on stderr; no exception is raised for it.
    """
    # We're getting this file from Japan!
    # It seems to work pretty well until they take down or change their website
    # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
    ref_filename = os.path.join(folder, 'reference.seq')

    remote_ref = url.urlopen(refurl)
    try:
        with open(ref_filename, 'wb') as fRef:
            fRef.write(remote_ref.read())
    finally:
        remote_ref.close()

    # Pass the arguments as a list without a shell, so that paths containing
    # spaces or shell metacharacters reach makeblastdb unchanged. The output
    # prefix is built from the folder directly rather than by textual
    # replacement inside the full path.
    cmd = ['makeblastdb',
           '-in', ref_filename,
           '-dbtype', 'nucl',
           '-out', os.path.join(folder, 'reference')]
    try:
        p = subprocess.Popen(cmd,
                             stdin=None,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)
    except OSError as err:
        # e.g. makeblastdb is not installed; report instead of crashing,
        # as the shell-based call never raised in that situation either.
        sys.stderr.write("Could not run makeblastdb: %s\n" % (err))
        return

    # communicate() drains both pipes while waiting; wait() alone can block
    # forever once the child has filled a pipe buffer.
    _, blast_err = p.communicate()
    if p.returncode != 0:
        sys.stderr.write("makeblastdb failed with exit status %i: %r\n"
                         % (p.returncode, blast_err))

    return
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
204
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
205 # --------------------------------------------------------------------------------------------------
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
206
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
207 # test if a node is an Element and that it has a specific tag name
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
208 def testElementTag(node, name):
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
209 return node.nodeType == node.ELEMENT_NODE and node.localName == name
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
210
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
211 # --------------------------------------------------------------------------------------------------
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
212
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
213 # Get the text from an element node with a text node child
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
def getText(element):
    """Return the normalised text held directly by ``element``.

    Only the immediate children that are text nodes contribute; their data
    is concatenated in document order and passed through normaliseText().
    Text inside child elements is not included.
    """
    pieces = [child.data
              for child in element.childNodes
              if child.nodeType == child.TEXT_NODE]
    return normaliseText(''.join(pieces))
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
220
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
221 # --------------------------------------------------------------------------------------------------
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
222
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
223 # remove unwanted whitespace including linebreaks etc.
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
def normaliseText(str):
    """Collapse every run of whitespace in ``str`` to a single space.

    Leading and trailing whitespace (including line breaks) is dropped.
    """
    words = str.split()
    return ' '.join(words)
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
226
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
227 # --------------------------------------------------------------------------------------------------
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
228
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
229 # A collection of interesting information about a taxon
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
class SpeciesInfo(object):
    """Plain record describing one species entry of the MLST index."""

    def __init__(self):
        """Create an empty record; every field starts unset."""
        self.name = None            # String name of species
        self.database_url = None    # URL as string
        self.retrieved = None       # date as string
        self.profiles_url = None    # URL as string
        self.profiles_count = None  # positive integer
        self.loci = []              # list of loci

    def __str__(self):
        """Return a multi-line summary, one "Label: value" line per field."""
        loci_text = ','.join(map(str, self.loci))
        lines = [
            "Name: %s\n" % self.name,
            "Database URL: %s\n" % self.database_url,
            "Retrieved: %s\n" % self.retrieved,
            "Profiles URL: %s\n" % self.profiles_url,
            "Profiles count: %s\n" % self.profiles_count,
            "Loci: %s\n" % (loci_text),
        ]
        return ''.join(lines)
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
247
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
248 # --------------------------------------------------------------------------------------------------
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
249
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
class LocusInfo(object):
    """Name and download URL of a single locus."""

    def __init__(self):
        """Create a locus record with both fields unset."""
        self.name = None
        self.url = None

    def __str__(self):
        """Return a one-line description of the locus."""
        fields = (self.name, self.url)
        return "Locus: name:%s,url:%s" % fields
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
256
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
257 # --------------------------------------------------------------------------------------------------
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
258
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
259 # retrieve the interesting information for a given sample element
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
def getSpeciesInfo(species_node, species):
    """Build a SpeciesInfo for ``species_node`` if its name matches.

    Args:
        species_node: a ``<species>`` DOM element from the repository index.
        species: name prefix to look for. The node matches when its own
            whitespace-normalised text starts with this string.

    Returns:
        A SpeciesInfo filled from the node's ``<mlst>/<database>`` children
        (database URL, retrieval date, profiles count and URL, loci), or
        None when the node's name does not start with ``species``.
    """
    this_name = getText(species_node)
    if not this_name.startswith(species):
        return None

    info = SpeciesInfo()
    info.name = this_name
    for mlst_node in species_node.getElementsByTagName('mlst'):
        for database_node in mlst_node.getElementsByTagName('database'):
            # Only direct children are inspected here, so that the <url>
            # elements nested in <profiles> and <locus> are not mistaken
            # for the database URL.
            for database_child_node in database_node.childNodes:
                if testElementTag(database_child_node, 'url'):
                    info.database_url = getText(database_child_node)
                elif testElementTag(database_child_node, 'retrieved'):
                    info.retrieved = getText(database_child_node)
                elif testElementTag(database_child_node, 'profiles'):
                    for profile_count in database_child_node.getElementsByTagName('count'):
                        info.profiles_count = getText(profile_count)
                    for profile_url in database_child_node.getElementsByTagName('url'):
                        info.profiles_url = getText(profile_url)
                elif testElementTag(database_child_node, 'loci'):
                    for locus_node in database_child_node.getElementsByTagName('locus'):
                        locus_info = LocusInfo()
                        # The locus name is the text held directly by <locus>.
                        locus_info.name = getText(locus_node)
                        for locus_url in locus_node.getElementsByTagName('url'):
                            locus_info.url = getText(locus_url)
                        info.loci.append(locus_info)

    return info
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
289
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
290 # --------------------------------------------------------------------------------------------------
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
291
25d4d9f313a0 Uploaded
ulfschaefer
parents:
diff changeset
# Run the download only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()