Mercurial > repos > ulfschaefer > data_manager_phemost
comparison data_manager/fetch_mlst_data.py @ 0:25d4d9f313a0 draft default tip
Uploaded
| author | ulfschaefer |
|---|---|
| date | Wed, 13 Jul 2016 05:50:48 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:25d4d9f313a0 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 ''' | |
| 4 Download MLST datasets from this site: http://pubmlst.org/data/ by | |
| 5 parsing an xml file (http://pubmlst.org/data/dbases.xml). | |
| 6 | |
| 7 Data is downloaded for a species determined by the user: | |
| 8 - profiles (maps STs to allele numbers) | |
| 9 - numbered sequences for each locus in the scheme | |
| 10 | |
| 11 In addition, the alleles are concatenated together for use with SRST2. | |
| 12 | |
| 13 A log file is also generated in the working directory, detailing the | |
| 14 time, date and location of all files downloaded, as well as the <retrieved> | |
| 15 tag which tells us when the XML entry was last updated. | |
| 16 | |
| 17 If the species name input by the user matches multiple <species> in the | |
| 18 xml file, the script simply reports the possible matches so the user can | |
| 19 try again. | |
| 20 ''' | |
| 21 | |
| 22 """ | |
| 23 - Remove empty line at the end of profiles.txt file. | |
| 24 - Ensure the allele names at the profiles.txt file don't contain "_". | |
| 25 | |
| 26 """ | |
| 27 from argparse import ArgumentParser | |
| 28 import xml.dom.minidom as xml | |
| 29 import urllib2 as url | |
| 30 import re | |
| 31 import os | |
| 32 import sys | |
| 33 import glob | |
| 34 import csv | |
| 35 import shutil | |
| 36 from urlparse import urlparse | |
| 37 import time | |
| 38 import subprocess | |
| 39 from json import dumps | |
| 40 from json import loads | |
| 41 | |
| 42 # -------------------------------------------------------------------------------------------------- | |
| 43 | |
| 44 def parse_args(): | |
| 45 parser = ArgumentParser(description='Download MLST datasets by species' | |
| 46 'from pubmlst.org.') | |
| 47 | |
| 48 parser.add_argument('--repository_url', | |
| 49 metavar = 'URL', | |
| 50 default = 'http://pubmlst.org/data/dbases.xml', | |
| 51 help = 'URL for MLST repository XML index') | |
| 52 | |
| 53 parser.add_argument('--species', | |
| 54 metavar = 'NAME', | |
| 55 required = True, | |
| 56 help = 'The name of the species that you want to download (e.g. "Escherichia coli")') | |
| 57 | |
| 58 parser.add_argument('--outfile', | |
| 59 metavar = 'FILE', | |
| 60 required = True, | |
| 61 help = 'The name of the Json file to write that galaxy stuff to.') | |
| 62 | |
| 63 parser.add_argument('--reference', | |
| 64 metavar = 'ACCESSION', | |
| 65 required = True, | |
| 66 help = 'NCBI accession number of the reference genome to use for flanking regions.') | |
| 67 | |
| 68 return parser.parse_args() | |
| 69 | |
| 70 # -------------------------------------------------------------------------------------------------- | |
| 71 | |
| 72 def main(): | |
| 73 | |
| 74 """ | |
| 75 <species> | |
| 76 Achromobacter spp. | |
| 77 <mlst> | |
| 78 <database> | |
| 79 <url>http://pubmlst.org/achromobacter</url> | |
| 80 <retrieved>2015-08-11</retrieved> | |
| 81 <profiles> | |
| 82 <count>272</count> | |
| 83 <url>http://pubmlst.org/data/profiles/achromobacter.txt</url> | |
| 84 </profiles> | |
| 85 <loci> | |
| 86 <locus> | |
| 87 nusA | |
| 88 <url> | |
| 89 http://pubmlst.org/data/alleles/achromobacter/nusA.tfa | |
| 90 </url> | |
| 91 </locus> | |
| 92 <locus> | |
| 93 rpoB | |
| 94 <url> | |
| 95 http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa | |
| 96 </url> | |
| 97 </locus> | |
| 98 """ | |
| 99 | |
| 100 args = parse_args() | |
| 101 docFile = url.urlopen(args.repository_url) # url address #args.repository_url =http://pubmlst.org/data/dbases.xml | |
| 102 | |
| 103 doc = xml.parse(docFile) | |
| 104 root = doc.childNodes[0] | |
| 105 found_species = [] | |
| 106 | |
| 107 if args.species == "Escherichia coli": | |
| 108 args.species = "Escherichia coli#1" | |
| 109 elif args.species == "Acinetobacter baumannii": | |
| 110 args.species = "Acinetobacter baumannii#1" | |
| 111 elif args.species == "Pasteurella multocida": | |
| 112 args.species = "Pasteurella multocida#1" | |
| 113 else: | |
| 114 pass | |
| 115 | |
| 116 for species_node in root.getElementsByTagName('species'): | |
| 117 info = getSpeciesInfo(species_node, args.species) | |
| 118 if info != None: | |
| 119 found_species.append(info) | |
| 120 | |
| 121 if len(found_species) == 0: | |
| 122 sys.stderr.write("No species matched your query.\n") | |
| 123 exit(1) | |
| 124 | |
| 125 if len(found_species) > 1: | |
| 126 sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species))) | |
| 127 for info in found_species: | |
| 128 sys.stderr.write(info.name + '\n') | |
| 129 exit(2) | |
| 130 | |
| 131 # output information for the single matching species | |
| 132 assert len(found_species) == 1 | |
| 133 species_info = found_species[0] | |
| 134 species_name_underscores = species_info.name.replace(' ', '_') | |
| 135 timestamp = time.strftime("%Y%m%d%H%M%S") | |
| 136 | |
| 137 params = loads(open(args.outfile).read()) | |
| 138 folder = os.path.join(params['output_data'][0]['extra_files_path'], species_name_underscores, timestamp) | |
| 139 | |
| 140 if not os.path.isdir(folder): | |
| 141 os.makedirs(folder) | |
| 142 | |
| 143 profile_doc = url.urlopen(species_info.profiles_url) | |
| 144 with open(os.path.join(folder, 'profiles.txt'), 'w') as f: | |
| 145 sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt'))) | |
| 146 for line in profile_doc.readlines(): | |
| 147 cols = line.split("\t") | |
| 148 f.write("%s\n" % ('\t'.join(cols[0:8]))) | |
| 149 profile_doc.close() | |
| 150 | |
| 151 for locus in species_info.loci: | |
| 152 locus_path = urlparse(locus.url).path | |
| 153 locus_filename = locus_path.split('/')[-1] | |
| 154 locus_filename = locus_filename.replace("_.tfa", ".fas") | |
| 155 locus_filename = locus_filename.replace("tfa", "fas") | |
| 156 locus_doc = url.urlopen(locus.url) | |
| 157 with open(os.path.join(folder, locus_filename), 'w') as locus_file: | |
| 158 locus_fasta_content = locus_doc.read() | |
| 159 locus_fasta_content = locus_fasta_content.replace("_","-").replace("--","-") | |
| 160 sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename))) | |
| 161 locus_file.write(locus_fasta_content) | |
| 162 locus_doc.close() | |
| 163 | |
| 164 get_reference(folder, args.reference) | |
| 165 | |
| 166 | |
| 167 # do Galaxy stuff | |
| 168 data_manager_dict = {} | |
| 169 data_manager_dict['data_tables'] = {} | |
| 170 name = "%s-%s" % (species_info.name, timestamp) | |
| 171 data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores, | |
| 172 dbkey=species_name_underscores, | |
| 173 name=name, | |
| 174 time_stamp=timestamp, | |
| 175 file_path=folder)] | |
| 176 #save info to json file | |
| 177 with open(args.outfile, 'wb') as fjson: | |
| 178 fjson.write(dumps(data_manager_dict)) | |
| 179 | |
| 180 # end of main -------------------------------------------------------------------------------------- | |
| 181 | |
| 182 def get_reference(folder, acc): | |
| 183 | |
| 184 # We're getting this file from Japan! | |
| 185 # It seems to work pretty well until they take down or change their website | |
| 186 # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643 | |
| 187 refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc) | |
| 188 remote_ref = url.urlopen(refurl) | |
| 189 ref_filename = os.path.join(folder, 'reference.seq') | |
| 190 with open(ref_filename, 'wb') as fRef: | |
| 191 fRef.write(remote_ref.read()) | |
| 192 remote_ref.close() | |
| 193 | |
| 194 cmd = "makeblastdb -in %s -dbtype nucl -out %s" \ | |
| 195 % (ref_filename, ref_filename.replace("reference.seq", "reference")) | |
| 196 p = subprocess.Popen(cmd, | |
| 197 shell=True, | |
| 198 stdin=None, | |
| 199 stdout=subprocess.PIPE, | |
| 200 stderr=subprocess.PIPE, close_fds=True) | |
| 201 p.wait() | |
| 202 | |
| 203 return | |
| 204 | |
| 205 # -------------------------------------------------------------------------------------------------- | |
| 206 | |
| 207 # test if a node is an Element and that it has a specific tag name | |
| 208 def testElementTag(node, name): | |
| 209 return node.nodeType == node.ELEMENT_NODE and node.localName == name | |
| 210 | |
| 211 # -------------------------------------------------------------------------------------------------- | |
| 212 | |
| 213 # Get the text from an element node with a text node child | |
| 214 def getText(element): | |
| 215 result = '' | |
| 216 for node in element.childNodes: | |
| 217 if node.nodeType == node.TEXT_NODE: | |
| 218 result += node.data | |
| 219 return normaliseText(result) | |
| 220 | |
| 221 # -------------------------------------------------------------------------------------------------- | |
| 222 | |
| 223 # remove unwanted whitespace including linebreaks etc. | |
| 224 def normaliseText(str): | |
| 225 return ' '.join(str.split()) | |
| 226 | |
| 227 # -------------------------------------------------------------------------------------------------- | |
| 228 | |
| 229 # A collection of interesting information about a taxa | |
| 230 class SpeciesInfo(object): | |
| 231 def __init__(self): | |
| 232 self.name = None # String name of species | |
| 233 self.database_url = None # URL as string | |
| 234 self.retrieved = None # date as string | |
| 235 self.profiles_url = None # URL as string | |
| 236 self.profiles_count = None # positive integer | |
| 237 self.loci = [] # list of loci | |
| 238 | |
| 239 def __str__(self): | |
| 240 s = "Name: %s\n" % self.name | |
| 241 s += "Database URL: %s\n" % self.database_url | |
| 242 s += "Retrieved: %s\n" % self.retrieved | |
| 243 s += "Profiles URL: %s\n" % self.profiles_url | |
| 244 s += "Profiles count: %s\n" % self.profiles_count | |
| 245 s += "Loci: %s\n" % (','.join([str(x) for x in self.loci])) | |
| 246 return s | |
| 247 | |
| 248 # -------------------------------------------------------------------------------------------------- | |
| 249 | |
| 250 class LocusInfo(object): | |
| 251 def __init__(self): | |
| 252 self.url = None | |
| 253 self.name = None | |
| 254 def __str__(self): | |
| 255 return "Locus: name:%s,url:%s" % (self.name, self.url) | |
| 256 | |
| 257 # -------------------------------------------------------------------------------------------------- | |
| 258 | |
| 259 # retrieve the interesting information for a given sample element | |
| 260 def getSpeciesInfo(species_node, species): | |
| 261 this_name = getText(species_node) | |
| 262 print this_name | |
| 263 if this_name.startswith(species): | |
| 264 info = SpeciesInfo() | |
| 265 info.name = this_name | |
| 266 for mlst_node in species_node.getElementsByTagName('mlst'): | |
| 267 for database_node in mlst_node.getElementsByTagName('database'): | |
| 268 for database_child_node in database_node.childNodes: | |
| 269 if testElementTag(database_child_node, 'url'): | |
| 270 info.database_url = getText(database_child_node) | |
| 271 elif testElementTag(database_child_node, 'retrieved'): | |
| 272 info.retrieved = getText(database_child_node) | |
| 273 elif testElementTag(database_child_node, 'profiles'): | |
| 274 for profile_count in database_child_node.getElementsByTagName('count'): | |
| 275 info.profiles_count = getText(profile_count) | |
| 276 for profile_url in database_child_node.getElementsByTagName('url'): | |
| 277 info.profiles_url = getText(profile_url) | |
| 278 elif testElementTag(database_child_node, 'loci'): | |
| 279 for locus_node in database_child_node.getElementsByTagName('locus'): | |
| 280 locus_info = LocusInfo() | |
| 281 locus_info.name = getText(locus_node) | |
| 282 for locus_url in locus_node.getElementsByTagName('url'): | |
| 283 locus_info.url = getText(locus_url) | |
| 284 info.loci.append(locus_info) | |
| 285 | |
| 286 return info | |
| 287 else: | |
| 288 return None | |
| 289 | |
| 290 # -------------------------------------------------------------------------------------------------- | |
| 291 | |
| 292 if __name__ == '__main__': | |
| 293 main() |
