Mercurial > repos > frogs > data_manager_frogs
changeset 0:7403d6c4f510 draft default tip
"planemo upload for repository https://github.com/geraldinepascal/FROGS-wrappers/ commit 2024a13846ea6f9bd94ae62e3b2a5a3aba8cd304"
author | frogs |
---|---|
date | Mon, 23 Aug 2021 10:21:10 +0000 (2021-08-23) |
parents | |
children | |
files | data_manager/FROGS_data_manager.py data_manager/FROGS_data_manager.xml data_manager_conf.xml static/images/FROGS_db.png static/images/FROGS_logo.png tool-data/frogs_db.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 8 files changed, 357 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# File: data_manager/FROGS_data_manager.py
#
# Copyright (C) 2021 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
"""Galaxy data manager script that downloads FROGS reference databases.

The script fetches the FROGS database index (a TSV file), optionally filters
it according to the command-line options, downloads and unpacks every
selected archive that is not yet present under ``<tool_data>/frogs_db`` and
writes the resulting ``frogs_db`` data table entries, as JSON, back into the
file given with ``--output``.

Only the Python standard library is used, because the tool wrapper declares
no requirements.
"""

__author__ = 'David Christiany Migale Jouy en Josas / Maria Bernard - Sigenae Jouy en Josas'
__copyright__ = 'Copyright (C) 2020 INRAE'
__license__ = 'GNU General Public License'
__version__ = '3.2.3'
__email__ = 'frogs-support@inrae.fr'
__status__ = 'prod'

import argparse
import json
import os
import shutil
import tarfile
import time
import urllib.request

# GALAXY_database=~/galaxy/galaxy-20.09/database
# FROGS_data_manager.py --database=frogs_db_data --all_dbs=false \
#   --date=0 --amplicons=16S --bases=SILVA --filters=Pintail100 \
#   --only_last_versions=true \
#   --tool_data=/home/maria/galaxy/galaxy-20.09/tool-data \
#   --output $GALAXY_database/objects/e/7/7/dataset_e7766c39-8f36-450c-adf5-3e4ee8d5c562.dat

# Index of every database published for FROGS; the first line is a header.
FROGS_DB_INDEX_URL = "http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv"

# Parsed command-line namespace; set by main() and read by frogs_sources().
args = None


def get_args(argv=None):
    """Parse the command line.

    :param argv: list of arguments to parse; ``None`` (the default) parses
        ``sys.argv[1:]`` exactly as before.
    :return: the ``argparse.Namespace``; every option is an optional string.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--database")
    parser.add_argument("--all_dbs", help='"true" to skip every filter, "false" to apply them')
    parser.add_argument("--date", help="keep databases dated YYYYMMDD or later, 0 for no limit")
    parser.add_argument("--amplicons", help="comma separated amplicon names")
    parser.add_argument("--bases", help="comma separated database names")
    parser.add_argument("--filters", help="comma separated filter names")
    parser.add_argument("--only_last_versions", help='"true" to keep one entry per database type')
    parser.add_argument("--tool_data", help="Galaxy tool-data directory")
    parser.add_argument("-o", "--output", help="Galaxy JSON parameter file, overwritten with the result")
    return parser.parse_args(argv)


def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    """Append ``data_table_entry`` to ``data_manager_dict['data_tables'][data_table]``.

    Missing levels are created. The dictionary is modified in place and also
    returned.
    """
    tables = data_manager_dict.setdefault('data_tables', {})
    tables.setdefault(data_table, []).append(data_table_entry)
    return data_manager_dict


def keep_only_last_version(db_index):
    """Keep the first index row of every database type.

    The type is amplicon + base (+ filter when the filter column is not
    empty). Rows are kept in their order of first appearance, so the index
    is presumably listed most recent first -- confirm against the TSV.

    :param db_index: list of rows (lists of strings) from the index.
    :return: list with one row per database type.
    """
    first_seen = {}
    for line in db_index:
        key_fields = line[1:4] if line[3] != "" else line[1:3]
        first_seen.setdefault("_".join(key_fields), line)
    return list(first_seen.values())


def _split_choices(raw):
    """Turn a user supplied "A, b,,C" string into ['a', 'b', 'c']."""
    if not raw:
        return []
    cleaned = (part.strip().lower() for part in raw.split(","))
    return [part for part in cleaned if part]


def _parse_db_index(content):
    """Parse the index TSV text into rows, lower-casing columns 1 to 3.

    The header line and blank lines (e.g. a trailing newline) are skipped.
    Columns used by this script: 0 date, 1 amplicons (comma separated),
    2 base, 3 filter, 4 suffix used in the display name, 5 value, 6 link.
    """
    rows = []
    for raw_line in content.splitlines()[1:]:
        if not raw_line.strip():
            continue
        fields = raw_line.split("\t")
        rows.append([fields[0]] + [field.lower() for field in fields[1:4]] + fields[4:])
    return rows


def _filter_db_index(db_index, amplicons, bases, filters, bottom_date):
    """Apply the user filters; an empty list or a 0 date disables a filter."""
    if amplicons:
        wanted = set(amplicons)
        db_index = [line for line in db_index
                    if any(amplicon.strip() in wanted for amplicon in line[1].split(','))]
    if bases:
        db_index = [line for line in db_index if line[2] in bases]
    if filters:
        db_index = [line for line in db_index if line[3] in filters]
    if bottom_date != 0:
        db_index = [line for line in db_index if int(line[0]) >= bottom_date]
    return db_index


def _download(url, destination):
    """Stream ``url`` into the file ``destination``.

    Replaces ``urllib.request.URLopener``, which is deprecated since
    Python 3.3. HTTP errors raise ``urllib.error.HTTPError``.
    """
    with urllib.request.urlopen(url) as response, open(destination, "wb") as out:
        shutil.copyfileobj(response, out)


def _safe_extract(archive_path, destination):
    """Extract a tar archive, refusing members that escape ``destination``.

    The archives come from the network, so they are not trusted blindly.
    """
    with tarfile.open(archive_path) as tar:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters do the checks themselves.
            tar.extractall(destination, filter="data")
            return
        root = os.path.realpath(destination)
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if os.path.commonpath([root, target]) != root:
                raise ValueError("unsafe path in archive: %s" % member.name)
        tar.extractall(destination)


def _single(candidates, what):
    """Return the only element of ``candidates`` or raise ``RuntimeError``."""
    candidates = sorted(candidates)
    if len(candidates) != 1:
        raise RuntimeError("expected exactly one %s, found %d: %s"
                           % (what, len(candidates), candidates))
    return candidates[0]


def frogs_sources(data_manager_dict, target_directory):
    """Download the selected FROGS databases and register them.

    Reads the module level ``args`` namespace (set by ``main``). A database
    whose directory already exists under ``<tool_data>/frogs_db`` is skipped
    and gets no table entry.

    :param data_manager_dict: dictionary receiving the ``frogs_db`` entries
        (``name``, ``value``, ``path`` of the fasta file); modified in place.
    :param target_directory: existing directory where archives are unpacked,
        inside a ``frogs_db_<YYYYMMDD>`` sub-directory.
    :raises RuntimeError: when an archive does not yield exactly one new
        top-level entry containing exactly one ``.fasta`` file.
    """
    tool_data_path = args.tool_data

    # get frogs database index
    with urllib.request.urlopen(FROGS_DB_INDEX_URL) as response:
        db_index = _parse_db_index(response.read().decode('utf-8'))

    # filter databases (the filter options are only provided when all_dbs is "false")
    if args.all_dbs == "false":
        db_index = _filter_db_index(
            db_index,
            _split_choices(args.amplicons),
            _split_choices(args.bases),
            _split_choices(args.filters),
            int(args.date or 0),
        )
    if args.only_last_versions == "true":
        db_index = keep_only_last_version(db_index)

    # get frogs dbs; absolute paths are used instead of os.chdir()
    db_root = os.path.join(target_directory, "frogs_db_" + time.strftime("%Y%m%d"))
    os.makedirs(db_root, exist_ok=True)
    archive = os.path.join(target_directory, "tmp.tar.gz")
    known = set(os.listdir(db_root))
    for line in db_index:
        suffix, value, link = line[4], line[5], line[6]
        if "_" not in suffix:
            name = value.replace("_", " ")
        else:
            # keep the underscores of the suffix itself in the display name
            name = value.replace(suffix, "").replace("_", " ") + suffix
        name_dir = link.replace(".tar.gz", "").split("/")[-1]
        if os.path.exists(os.path.join(tool_data_path, "frogs_db", name_dir)):
            continue  # already installed in the frogs_db directory

        # download and unpack the database
        _download(link, archive)
        try:
            _safe_extract(archive, db_root)
        finally:
            os.remove(archive)

        # the entry that appeared in db_root is the new database directory
        current = set(os.listdir(db_root))
        new_db = os.path.join(db_root, _single(current - known, "new database directory"))
        known = current

        # get fasta file path
        fasta = _single(
            [entry for entry in os.listdir(new_db) if entry.endswith('.fasta')],
            "fasta file in " + new_db,
        )
        # release = value + "_" + time.strftime("%Y-%m-%d")
        # date = time.strftime("%Y%m%d")
        path = os.path.join(new_db, fasta)

        data_table_entry = dict(name=name, value=value, path=path)
        _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_db")

# Disabled (kept from the original file):
# def HVL_sources(data_manager_dict,target_directory):
#     HVL_dir = "http://genoweb.toulouse.inra.fr/frogs_databanks/HVL/ITS/UNITE_s_7.1_20112016"
#     os.chdir(target_directory)
#     for link in [HVL_dir + "/Unite_s_7.1_20112016_ITS1.fasta",HVL_dir + "/Unite_s_7.1_20112016_ITS2.fasta"]:
#         file_name=link.split("/")[-1].replace('.fasta',"_"+time.strftime("%Y-%m-%d")+".fasta")
#         dl_file = urllib.URLopener()
#         dl_file.retrieve(link,file_name)

#         #get fasta file path
#         path = os.path.join(target_directory,file_name)
#         if link.endswith('ITS1.fasta'):
#             name = "UNITE 7.1 ITS1 " + time.strftime("%Y-%m-%d")
#         elif link.endswith('ITS2.fasta'):
#             name = "UNITE 7.1 ITS2 " + time.strftime("%Y-%m-%d")
#         value=file_name.replace('.fasta','')

#         data_table_entry = dict(name = name, value = value, path=path)
#         _add_data_table_entry(data_manager_dict, data_table_entry, "frogs_HVL_db")


def main():
    """Run the data manager: read Galaxy's JSON parameters, download, write the result."""
    # get args from command line
    global args
    args = get_args()

    # Extract json file params
    data_manager_dict = {}
    filename = args.output
    with open(filename) as handle:
        params = json.loads(handle.read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.makedirs(target_directory, exist_ok=True)

    # if args.database=="frogs_db_data":
    frogs_sources(data_manager_dict, target_directory)
    # elif args.database=="HVL_db_data":
    #     HVL_sources(data_manager_dict,target_directory)

    # save info to json file (the parameter file is overwritten with the result)
    with open(filename, 'wt') as handle:
        handle.write(json.dumps(data_manager_dict))


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/FROGS_data_manager.xml Mon Aug 23 10:21:10 2021 +0000 @@ -0,0 +1,75 @@ +<tool id="FROGS_data_manager" name="FROGS Data manager" version="3.2.3+galaxy2" tool_type="manage_data"> + <requirements> + </requirements> + <stdio> + <exit_code range="1:" /> + </stdio> + <command><![CDATA[ + $__tool_directory__/FROGS_data_manager.py + --database="frogs_db_data" + + --all_dbs="$db_type.db" + #if $db_type.db=="false" + --date="$db_type.date" + --amplicons="$db_type.amplicons" + --bases="$db_type.bases" + --filters="$db_type.filters" + #end if + --only_last_versions="$only_last_versions" + + --tool_data="$__tool_data_path__" + --output "$output" + ]]></command> + <inputs> + <conditional name="db_type"> + <param name="db" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Download all databases"/> + <when value="true"/> + <when value="false"> + <param name="date" value="0" type="text" label="Download only most recent databases" help="Select databases uploaded later than this date. 
Please enter a date at the following format: YYYYMMDD, else leave 0."> + <validator type="regex" message="Please enter a date at the following format: YYYYMMDD, leave 0 for no date">0|[1-2]{1}[0-9]{3}[0-1]{1}[0-9]{1}([0-2]{1}[0-9]{1}|3[0-1]{1})</validator> + </param> + <param name="amplicons" type="text" label='Select amplicon specific databases' help='Write amplicons names separated by ","; example: "COI,ITS,16S" or "23S"'/> + <param name="bases" type="text" label='Select database by name' help='Write base names separated by ","; example: "SILVA,PR2,MIDAS" or "BOLD"'/> + <param name="filters" type="text" label='Select database on filter name' help='Write filter names separated by ",";example: "Pintail100,Fungi"'/> + </when> + </conditional> + <param name="only_last_versions" type="boolean" checked="true" label="Download only the last version of selected database"/> + </inputs> + <outputs> + <data name="output" format="data_manager_json"/> + </outputs> + <help> + + .. image:: static/images/FROGS_logo.png + :height: 144 + :width: 110 + + FROGS datamanager allows to download preformated databases for FROGS Affiliation OTU tool. + + All databases are available at http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/. + + This tools is based on the `FROGS_databases.tsv.txt <http://genoweb.toulouse.inra.fr/frogs_databanks/assignation/FROGS_databases.tsv>`_, that we update each time a new database is formatted. + + .. 
image:: static/images/FROGS_db.png + :height: 157 + :width: 961 + + You may download all databases, but you may (and should) filter the desired databases on different criteria: + + - on a date, to download only the most recently formatted databases + - on an amplicon type + - on a base name + - optionally on a filter name; this may be the case, for example, for the 16S SILVA database, for which we provide reference sequences with a pintail score above a threshold + + The last option allows you to download only the last version of the previously selected databases; indeed, we have provided reference databases since 2016, with, for example, around one version of SILVA per year. + + **Contact** + + Contacts: frogs-support@inrae.fr + + Repositories: https://github.com/geraldinepascal/FROGS, https://github.com/geraldinepascal/FROGS-wrappers + + Website: http://frogs.toulouse.inrae.fr/ + + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Mon Aug 23 10:21:10 2021 +0000 @@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/FROGS_data_manager.xml" id="frogs_data_manager"> + <data_table name="frogs_db"> + <output> + <column name="value" /> + <column name="name" /> + <column name="path" output_ref="output"> + <move type="directory" relativize_symlinks="False"> + <source>#echo "/".join(str($path).split('/')[:-1])#</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">frogs_db/#echo str($path).split('/')[-2]#/</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/frogs_db/#echo "/".join(str($path).split('/')[-2:])#</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/frogs_db.loc.sample Mon Aug 23 10:21:10 2021 +0000 @@ -0,0 +1,53 @@ +# Copyright (C) 2014 INRA +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# +#This is a sample file that enables tools FROGS_affiliations_OTU to use taxonomy database for +#taxonomy affiliation. You will need to create or download Blast+ index and train your database +#for RDP classifier these data files. +#download link : http://genoweb.toulouse.inra.fr/frogs_databanks/assignation +#Finally you will need to create frogs_db.loc file similar to this one in your galaxy +#tool-data directory.The frogs_db.loc file has this format (longer white space characters are +#TAB characters): +# +#<unique_database_name> <file_path> +# +#First column will be the visible name in galaxy. +#So, for example, if you had 16S silva 128 indexed stored in +#/galaxy_databanks/16S/silva_128/ +#then the frogs_db.loc entry would look like this: +# +#silva 128 16S /galaxy_databanks/16S/silva_128/silva_128_16S.fasta +# +#and your /galaxy_databanks/16S/silva_128/ directory +#would contain index files: +# +#-rw-r--r-- 1 mbernard FROGS 8097966 5 déc. 16:56 bergeyTrainingTree.xml +#-rw-r--r-- 1 mbernard FROGS 1572981589 5 déc. 16:56 genus_wordConditionalProbList.txt +#-rw-r--r-- 1 mbernard FROGS 1654 5 déc. 16:56 LICENCE.txt +#-rw-r--r-- 1 mbernard FROGS 1072228 5 déc. 
16:56 logWordPrior.txt +#-rw-r--r-- 1 mbernard FROGS 940834335 5 déc. 16:56 silva_128_16S.fasta +#-rw-r--r-- 1 mbernard FROGS 152606489 5 déc. 16:56 silva_128_16S.fasta.nhr +#-rw-r--r-- 1 mbernard FROGS 6918588 5 déc. 16:56 silva_128_16S.fasta.nin +#-rw-r--r-- 1 mbernard FROGS 205320030 5 déc. 16:56 silva_128_16S.fasta.nsq +#-rw-r--r-- 1 mbernard FROGS 281 5 déc. 16:56 silva_128_16S.fasta.properties +#-rw-r--r-- 1 mbernard FROGS 3420464 5 déc. 16:56 silva_128_16S.tax +#-rw-r--r-- 1 mbernard FROGS 964048 5 déc. 16:57 wordConditionalProbIndexArr.txt +# +# +#<name> <name> <file_path> +# +# EXAMPLE FOR TEST : +#ITS1_test ITS1_test ${__HERE__}/frogs_db_data/ITS1.rdp.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Aug 23 10:21:10 2021 +0000 @@ -0,0 +1,8 @@ +<?xml version="1.0"?> +<tables> + <!-- Location of frogs database files --> + <table name="frogs_db" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/frogs_db.loc" /> + </table> +</tables>