comparison data_manager/FROGS_data_manager.py @ 0:7403d6c4f510 draft default tip

"planemo upload for repository https://github.com/geraldinepascal/FROGS-wrappers/ commit 2024a13846ea6f9bd94ae62e3b2a5a3aba8cd304"
author frogs
date Mon, 23 Aug 2021 10:21:10 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7403d6c4f510
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright (C) 2021 INRA
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 #
19
# Module metadata consumed by FROGS packaging/tooling.
# NOTE(review): the license header above says "Copyright (C) 2021 INRA" while
# __copyright__ below says 2020 INRAE — confirm which year/institute is intended.
__author__ = 'David Christiany Migale Jouy en Josas / Maria Bernard - Sigenae Jouy en Josas'
__copyright__ = 'Copyright (C) 2020 INRAE'
__license__ = 'GNU General Public License'
__version__ = '3.2.3'
__email__ = 'frogs-support@inrae.fr'
__status__ = 'prod'
26
27 # import json
28 import argparse
29 import os
30 # import sys
31 import tarfile
32 import time
33 import urllib
34
35 from galaxy.util.json import from_json_string, to_json_string
36
37 import requests
38
39 # GALAXY_database=~/galaxy/galaxy-20.09/database
40 # FROGS_data_manager.py --database=frogs_db_data --all_dbs=false \
41 # --date=0 --amplicons=16S --bases=SILVA --filters=Pintail100 \
42 # --only_last_versions=true \
43 # --tool_data=/home/maria/galaxy/galaxy-20.09/tool-data \
44 # --output $GALAXY_database/objects/e/7/7/dataset_e7766c39-8f36-450c-adf5-3e4ee8d5c562.dat
45
46
def get_args():
    """Parse the command-line options and return the populated namespace.

    All options are optional strings; boolean-like options (``--all_dbs``,
    ``--only_last_versions``) are passed as the literal strings "true"/"false".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--database")
    # These long-only options all share the same (default) string behaviour.
    for flag in ("--all_dbs", "--date", "--amplicons", "--bases",
                 "--filters", "--only_last_versions", "--tool_data"):
        parser.add_argument(flag)
    parser.add_argument("-o", "--output")
    return parser.parse_args()
61
62
63 def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
64 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
65 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
66 data_manager_dict['data_tables'][data_table].append(data_table_entry)
67 return data_manager_dict
68
69
def keep_only_last_version(db_index):
    """Return one index line per database type, keeping the first seen.

    A database type is the underscore-joined amplicon/base(/filter) columns;
    the filter column (index 3) is included only when non-empty.  Because the
    index is ordered newest-first, the first line kept for each type is its
    most recent release.  Input order of the surviving lines is preserved.
    """
    newest_by_type = {}
    for entry in db_index:
        key_fields = entry[1:4] if entry[3] != "" else entry[1:3]
        newest_by_type.setdefault("_".join(key_fields), entry)
    return list(newest_by_type.values())
77
78
def frogs_sources(data_manager_dict, target_directory):
    """Download the selected FROGS assignation databases and register each
    one as a ``frogs_db`` data-table entry.

    :param data_manager_dict: accumulator dict for data-table entries (mutated
        via ``_add_data_table_entry``).
    :param target_directory: Galaxy-provided extra-files directory; the
        process chdirs into it and unpacks archives there.

    Relies on the module-level ``args`` namespace set in ``main()``.
    """
    # Local import: the top of the file only does ``import urllib``, which in
    # Python 3 does NOT make the ``urllib.request`` submodule available.
    import urllib.request

    # User-selected filters, lower-cased; empty lists mean "keep everything".
    amplicons_list = []
    bases_list = []
    filters_list = []
    if args.all_dbs == "false":
        amplicons_list = [amplicon.lower().strip() for amplicon in args.amplicons.split(",") if amplicon != ""]
        bases_list = [base.lower().strip() for base in args.bases.split(",") if base != ""]
        # "flt", not "filter", to avoid shadowing the builtin.
        filters_list = [flt.lower().strip() for flt in args.filters.split(",") if flt != ""]
    bottom_date = int(args.date)
    tool_data_path = args.tool_data

    # Fetch the remote tab-separated index of available databases.
    frogs_db_index_link = "http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv"
    with requests.Session() as s:
        download = s.get(frogs_db_index_link)
        decoded_content = download.content.decode('utf-8')
        db_index = decoded_content.splitlines()
        db_index = [line.split("\t") for line in db_index[1:]]  # drop header row
        # Normalise the amplicon / base / filter columns to lower case so they
        # compare against the lower-cased user selections above.
        db_index = [[line[0], line[1].lower(), line[2].lower(), line[3].lower()] + line[4:] for line in db_index]

    # Keep only the databases matching the user's selection.
    if args.all_dbs == "false":
        # filter by amplicons (a line may list several, comma-separated)
        if len(amplicons_list) != 0:
            db_index = [line for line in db_index if any(amplicon in amplicons_list for amplicon in line[1].split(','))]
        # filter by base
        if len(bases_list) != 0:
            db_index = [line for line in db_index if line[2] in bases_list]
        # filter by filters
        if len(filters_list) != 0:
            db_index = [line for line in db_index if line[3] in filters_list]
        # filter by date (index column 0 is a YYYYMMDD integer)
        if bottom_date != 0:
            db_index = [line for line in db_index if int(line[0]) >= bottom_date]
        if args.only_last_versions == "true":
            db_index = keep_only_last_version(db_index)

    # Download and unpack each selected database.
    os.chdir(target_directory)
    dir_name = "frogs_db_" + time.strftime("%Y%m%d")
    os.mkdir(dir_name)
    dbs = set([])
    for line in db_index:
        value = line[5]
        # Display name: underscores become spaces, except inside the trailing
        # token line[4] (presumably a version string) which is kept verbatim.
        name = value.replace("_", " ") if "_" not in line[4] else value.replace(line[4], "").replace("_", " ") + line[4]
        link = line[6]
        name_dir = link.replace(".tar.gz", "").split("/")[-1]
        file_path = tool_data_path + "/frogs_db/" + name_dir
        if not os.path.exists(file_path):  # skip databases already installed

            # Download the archive.  The previous ``urllib.request.URLopener``
            # call was broken twice over: the submodule was never imported,
            # and URLopener (deprecated since 3.3) was removed in Python 3.11.
            urllib.request.urlretrieve(link, "tmp.tar.gz")

            # Unpack, then delete the archive.  The explicit tar.close() was
            # redundant inside the ``with`` block and has been dropped.
            # NOTE(review): extractall trusts member paths from a downloaded
            # archive; consider the tarfile extraction filter (Python >= 3.12).
            with tarfile.open("tmp.tar.gz") as tar:
                tar.extractall(dir_name)
            os.remove('tmp.tar.gz')

            # The newly extracted directory is the one not seen before.
            tmp = set(os.listdir(dir_name))
            new_db = dir_name + "/" + "".join(tmp.difference(dbs))
            files = os.listdir(new_db)
            fasta = "".join([f for f in files if f.endswith('.fasta')])
            path = new_db + '/' + fasta
            dbs = os.listdir(dir_name)
            path = os.path.join(target_directory, path)

            data_table_entry = dict(name=name, value=value, path=path)
            _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")
155
156 # def HVL_sources(data_manager_dict,target_directory):
157 # HVL_dir = "http://genoweb.toulouse.inra.fr/frogs_databanks/HVL/ITS/UNITE_s_7.1_20112016"
158 # os.chdir(target_directory)
159 # for link in [HVL_dir + "/Unite_s_7.1_20112016_ITS1.fasta",HVL_dir + "/Unite_s_7.1_20112016_ITS2.fasta"]:
160 # file_name=link.split("/")[-1].replace('.fasta',"_"+time.strftime("%Y-%m-%d")+".fasta")
161 # dl_file = urllib.URLopener()
162 # dl_file.retrieve(link,file_name)
163
164 # #get fasta file path
165 # path = os.path.join(target_directory,file_name)
166 # if link.endswith('ITS1.fasta'):
167 # name = "UNITE 7.1 ITS1 " + time.strftime("%Y-%m-%d")
168 # elif link.endswith('ITS2.fasta'):
169 # name = "UNITE 7.1 ITS2 " + time.strftime("%Y-%m-%d")
170 # value=file_name.replace('.fasta','')
171
172 # data_table_entry = dict(name = name, value = value, path=path)
173 # _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_HVL_db")
174
175
def main():
    """Entry point: read the Galaxy job JSON, fetch the databases, and write
    the resulting data-table entries back to the same JSON file.
    """
    # get args from command line; stored globally because frogs_sources()
    # reads the module-level ``args`` namespace.
    global args
    args = get_args()

    # Galaxy hands the job parameters through the output JSON file, which we
    # later overwrite with the data-table entries.
    data_manager_dict = {}
    filename = args.output
    # Use context managers so the handles are closed deterministically
    # (the original left both files open).
    with open(filename) as handle:
        params = from_json_string(handle.read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    # if args.database=="frogs_db_data":
    frogs_sources(data_manager_dict, target_directory)
    # elif args.database=="HVL_db_data":
    #     HVL_sources(data_manager_dict,target_directory)

    # save info to json file
    with open(filename, 'wt') as handle:
        handle.write(to_json_string(data_manager_dict))
196
197
# Standard script guard: run the data manager only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()