annotate config_lookup.py @ 25:2e27fdf82795 draft default tip

Uploaded
author computationaltranscriptomics
date Thu, 02 Apr 2020 12:07:52 -0400
parents ba52692d7a95
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
15
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
1 #!/usr/local/bin/python3
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
2
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
3 '''
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
4 This script downloads lookup tables and integrates these into the Galaxy instance
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
5
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
6 USAGE
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
7 config_lookup.py --galaxy GALAXY [--acclinks ACCLINKS] [--acclists ACCLISTS]
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
8
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
9 OPTIONS
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
10 -h, --help show this help message and exit
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
11
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
12 '''
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
13
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
14 import os
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
15 import argparse
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
16 import requests
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
17 import sys
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
18 import shutil
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
19
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
def main():
    '''Download the accession lists and register them with a Galaxy instance.

    Reads one URL per line from the --acclinks file, saves every download
    into the --acclists directory and writes a tab-separated table
    (display name <TAB> absolute path of the list) to
    ./tool-data/glassgo_accession_list.txt, which is finally copied into
    <galaxy>/tool-data/.

    Command line arguments:
        --galaxy    root directory of the Galaxy instance (required)
        --acclinks  file with the URLs of the accession lists
                    (default: ./accession_lists_links.txt)
        --acclists  directory the accession lists are saved to
                    (default: ./acclists)

    Raises:
        SystemExit: with status 1 if the Galaxy directory or the links file
            does not exist.
        requests.exceptions.RequestException: if a download fails, times
            out or the server answers with an HTTP error status.
    '''
    # parse arguments
    parser = argparse.ArgumentParser(description='incorporate the accession lists in GLASSgo/Galaxy to enable clade-specific searches')
    parser.add_argument('--galaxy', required=True, help='(absolute) path to the root directory of the Galaxy instance')
    parser.add_argument('--acclinks', help='(absolute) path to file containing URLs to the accession lists')
    parser.add_argument('--acclists', help='(absolute) path to directory to save the accession lists to')
    args = parser.parse_args()

    # taxonomy id and taxonomic rank for each known accession list, keyed by
    # the file name (without extension) of the list
    # NOTE(review): "Viruses" is labelled "phylum" although it looks like a
    # top-level group similar to "Bacteria"/"Archaea" - confirm against the
    # NCBI Taxonomy before changing the label.
    rank = {
        "Alphaproteobacteria": {"tax": 28211, "rank": "class"},
        "Aquificae": {"tax": 200783, "rank": "phylum"},
        "Archaea": {"tax": 2157, "rank": "superkingdom"},
        "Armatimonadetes": {"tax": 67819, "rank": "phylum"},
        "Bacteria": {"tax": 2, "rank": "superkingdom"},
        "Bacteroidetes": {"tax": 976, "rank": "phylum"},
        "Caldiserica": {"tax": 67814, "rank": "phylum"},
        "Chlamydiae": {"tax": 204428, "rank": "phylum"},
        "Chloroflexi": {"tax": 200795, "rank": "phylum"},
        "Chrysiogenetes": {"tax": 200938, "rank": "phylum"},
        "Cyanobacteria": {"tax": 1117, "rank": "phylum"},
        "Deferribacteres": {"tax": 200930, "rank": "phylum"},
        "Deinococcus-thermus": {"tax": 1297, "rank": "phylum"},
        "Dictyoglomi": {"tax": 68297, "rank": "phylum"},
        "Elusimicrobia": {"tax": 74152, "rank": "phylum"},
        "Fibrobacteres": {"tax": 65842, "rank": "phylum"},
        "Firmicutes": {"tax": 1239, "rank": "phylum"},
        "Fusobacteria": {"tax": 32066, "rank": "phylum"},
        "Gemmatimonadetes": {"tax": 142182, "rank": "phylum"},
        "Nitrospinae": {"tax": 1293497, "rank": "phylum"},
        "Nitrospirae": {"tax": 40117, "rank": "phylum"},
        "Planctomycetes": {"tax": 203682, "rank": "phylum"},
        "Proteobacteria": {"tax": 1224, "rank": "phylum"},
        "Spirochaetes": {"tax": 203691, "rank": "phylum"},
        "Synergistetes": {"tax": 508458, "rank": "phylum"},
        "Tenericutes": {"tax": 544448, "rank": "phylum"},
        "Thermodesulfobacteria": {"tax": 200940, "rank": "phylum"},
        "Thermotogae": {"tax": 200918, "rank": "phylum"},
        "Viruses": {"tax": 10239, "rank": "phylum"},
    }

    # ./accession_lists_links.txt as default
    if args.acclinks is None:
        args.acclinks = os.path.join(os.getcwd(), 'accession_lists_links.txt')

    # ./acclists as default folder for the accession lists
    if args.acclists is None:
        args.acclists = os.path.join(os.getcwd(), 'acclists')

    # check for existence of the folders for galaxy and URLs to the accession lists
    # (exit with a non-zero status so that calling scripts can detect the failure)
    if not os.path.exists(args.galaxy):
        print('\tERROR: ' + args.galaxy + ' could not be found!')
        sys.exit(1)
    if not os.path.exists(args.acclinks):
        print('\tERROR: ' + args.acclinks + ' could not be found!')
        sys.exit(1)

    print('################ configure the accession lists ################')
    print('### the accession lists will be saved to ' + args.acclists)

    # create folder for accession lists
    os.makedirs(args.acclists, exist_ok=True)

    # list with lookup tables that populates the user interface; it is
    # staged in ./tool-data, which may not exist yet
    accDataTableFile = os.path.join(os.getcwd(), 'tool-data/glassgo_accession_list.txt')
    os.makedirs(os.path.dirname(accDataTableFile), exist_ok=True)

    with open(args.acclinks, 'r') as link, open(accDataTableFile, 'w') as accDataTable:
        accDataTable.write('global\tglobal\n')

        # fetch accession lists
        for line in link:
            # drop the line ending (it must not be sent as part of the URL)
            # and ignore empty lines
            url = line.strip()
            if not url:
                continue

            # the timeout keeps a stalled server from blocking the script forever
            acc = requests.get(url, timeout=60)
            # do not store an HTTP error page as if it were an accession list
            acc.raise_for_status()

            filename = os.path.basename(url)
            filenameStem = os.path.splitext(filename)[0]
            accListFile = os.path.join(args.acclists, filename)

            print('### fetch: ' + filename)
            with open(accListFile, 'wb') as accList:
                accList.write(acc.content)

            # known clades are displayed together with taxonomy id and rank
            if filenameStem in rank:
                taxid = str(rank[filenameStem]["tax"])
                rankname = rank[filenameStem]["rank"]
                filenameStem = filenameStem + ' (tax:' + taxid + ', rank:' + rankname + ')'

            accDataTable.write(filenameStem + '\t' + accListFile + '\n')

    print('### create tab-separated list ' + accDataTableFile)

    # copy list with accession lists to /galaxy/tool-data
    shutil.copy(accDataTableFile, os.path.join(args.galaxy, 'tool-data/'))
    print('### move tab-separated list to ' + str(os.path.join(args.galaxy, 'tool-data/')))
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
116
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
117 #
4ac32c671a40 Uploaded
computationaltranscriptomics
parents:
diff changeset
# run the configuration only when this file is executed as a script
if __name__ == '__main__':
    main()