annotate data_manager_amrfinderplus/data_manager/data_manager_amrfinderplus.py @ 0:5ba68abd41f6 draft

Uploaded
author estrain
date Tue, 24 May 2022 11:46:19 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
1 #!/usr/bin/env python
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
2 # Errol Strain, estrain@gmail.com
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
3 # Database downloads for NCBI AMRFinderPlus
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
4
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
5 import sys
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
6 import os
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
7 import tempfile
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
8 import shutil
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
9 import json
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
10 import re
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
11 import argparse
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
12 from ftplib import FTP
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
13
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
14
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
def download_from_ncbi(output_directory):
    """Fetch the latest AMRFinderPlus database from NCBI and index it.

    Every entry of the NCBI ``latest`` database directory whose name does not
    contain ``allele`` is downloaded into *output_directory*.  BLAST databases
    are then built for ``AMRProt``, ``AMR_CDS`` and each downloaded file whose
    name contains ``AMR_DNA-`` but not ``tab``, and ``AMR.LIB`` is pressed
    with ``hmmpress``.

    The process working directory is switched to *output_directory* while the
    function runs and is restored afterwards, even when an error occurs.

    Args:
        output_directory: Existing directory that receives the downloaded
            files and the generated indexes.

    Returns:
        The first line of the downloaded ``version.txt`` with trailing
        whitespace removed.

    Raises:
        Any ``ftplib`` / ``OSError`` raised while connecting, downloading or
        reading ``version.txt`` is propagated to the caller.
    """
    # Local imports: the module header does not import these names and this
    # block must stay self-contained.
    import subprocess
    from ftplib import all_errors

    NCBI_FTP_SERVER = 'ftp.ncbi.nlm.nih.gov'
    NCBI_DOWNLOAD_PATH = '/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/'
    VERSION_FILE = 'version.txt'
    email = 'anonymous@example.com'

    def run_tool(argv, quiet=False):
        # Run an external indexing tool from an argument list (no shell), so
        # that file names coming from the FTP listing can never be interpreted
        # as shell syntax.  Returns the exit status.
        try:
            if quiet:
                with open(os.devnull, 'wb') as sink:
                    return subprocess.call(argv, stdout=sink, stderr=sink)
            return subprocess.call(argv)
        except OSError:
            # os.system() reported a missing executable as a non-zero status
            # instead of raising; keep that best-effort contract.
            return 127

    cwd = os.getcwd()
    os.chdir(output_directory)
    try:
        ftp = FTP(NCBI_FTP_SERVER)
        try:
            ftp.login('anonymous', email)
            ftp.cwd(NCBI_DOWNLOAD_PATH)

            # Exclude the allele counts folder.
            files = [name for name in ftp.nlst() if 'allele' not in name]

            for name in files:
                # Close each local file as soon as its transfer finishes so
                # the data is flushed before the indexing tools read it.
                with open(name, 'wb') as local_file:
                    ftp.retrbinary("RETR " + name, local_file.write)
        finally:
            # Disconnect before the (potentially slow) indexing step so an
            # idle-timeout on the control connection cannot fail the run.
            try:
                ftp.quit()
            except all_errors:
                ftp.close()

        # Point mutation FASTA files: AMR_DNA-* entries that are not tables.
        pointmuts = [
            name for name in files
            if 'AMR_DNA-' in name and 'tab' not in name
        ]

        # Make blast databases.
        # NOTE(review): exit statuses are ignored, exactly as the original
        # os.system() calls did; consider failing the job on a non-zero status.
        run_tool(["makeblastdb", "-in", "AMRProt", "-dbtype", "prot",
                  "-logfile", "/dev/null"])
        run_tool(["makeblastdb", "-in", "AMR_CDS", "-dbtype", "nucl",
                  "-logfile", "/dev/null"])
        for name in pointmuts:
            run_tool(["makeblastdb", "-in", name, "-dbtype", "nucl",
                      "-logfile", "/dev/null"])

        # Make HMM indexes (stdout and stderr discarded).
        run_tool(["hmmpress", "-f", "AMR.LIB"], quiet=True)

        # Read in version.
        with open(VERSION_FILE) as version_handle:
            version = version_handle.readline().rstrip()
    finally:
        os.chdir(cwd)

    return version
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
65
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
def print_json (version,argspath,argsname,argsout):
    """Write the ``amrfinderplus`` data table entry to *argsout* as JSON.

    The document has a single ``data_tables`` -> ``amrfinderplus`` list
    holding one row with the keys ``value`` (*version*), ``name``
    (*argsname*) and ``path`` (*argspath*).  Output is indented by two
    spaces with keys sorted; *argsout* is overwritten.
    """
    row = {}
    row["value"] = version
    row["name"] = argsname
    row["path"] = argspath

    tables = {'amrfinderplus': [row]}
    document = {'data_tables': tables}

    with open(argsout, 'w') as out_handle:
        json.dump(document, out_handle, indent=2, sort_keys=True)
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
82
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
def main():
    """Command-line entry point: download the database and register it.

    The file given by ``--out`` is first read as JSON to obtain
    ``output_data[0]['extra_files_path']``, the directory the database is
    downloaded into.  After the download the same file is overwritten with
    the data table entry produced by ``print_json``.

    ``--name`` is required by the parser but its value is not used; the
    table entry name is built from the downloaded database version.
    """
    parser = argparse.ArgumentParser(description='Download NCBI amrFinderPlus Databases')
    parser.add_argument('--name', type=str, required=True, nargs=1, help='Database name')
    parser.add_argument('--out', type=str, required=True, nargs=1, help='output file')

    args = parser.parse_args()
    out_path = args.out[0]

    with open(out_path) as fh:
        params = json.load(fh)

    output_directory = params['output_data'][0]['extra_files_path']
    # os.mkdir() failed when a parent directory was missing or when the
    # target already existed (e.g. on a re-run); create it only if needed,
    # including any missing parents.
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    # Fetch the files and build blast databases.
    version = download_from_ncbi(output_directory)

    tablename = "AMRFinderPlus Database " + version

    print_json(version, output_directory, tablename, out_path)
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
105
5ba68abd41f6 Uploaded
estrain
parents:
diff changeset
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()