Previous changeset 0:45755942ae7b (2017-03-14) Next changeset 2:0a9ffebba65d (2017-06-26) |
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6 |
modified:
data_manager/customProDB_annotation.R data_manager/customProDB_annotation.xml |
added:
ensembl_datasets.loc.sample tool-data/update_ensembl_datasets.R |
b |
diff -r 45755942ae7b -r 9b4ee836e35b data_manager/customProDB_annotation.R --- a/data_manager/customProDB_annotation.R Tue Mar 14 14:11:55 2017 -0400 +++ b/data_manager/customProDB_annotation.R Thu Jun 08 10:55:08 2017 -0400 |
[ |
b'@@ -12,10 +12,13 @@\n \n suppressPackageStartupMessages(library("optparse"))\n suppressPackageStartupMessages(library("RGalaxy"))\n+suppressPackageStartupMessages(library("GetoptLong"))\n \n \n option_list <- list()\n option_list$dbkey <- make_option(\'--dbkey\', type=\'character\')\n+option_list$ensembl_host <- make_option(\'--ensembl_host\', type=\'character\')\n+option_list$ensembl_dataset <- make_option(\'--ensembl_dataset\', type=\'character\')\n option_list$dbsnp <- make_option(\'--dbsnp\', type=\'character\')\n option_list$cosmic <- make_option(\'--cosmic\', type=\'logical\')\n option_list$outputFile <- make_option(\'--outputFile\', type=\'character\')\n@@ -25,20 +28,29 @@\n \n \n customProDB_annotation <- function(\n-\tdbkey = GalaxyCharacterParam(required=TRUE), \n+\tdbkey = GalaxyCharacterParam(required=FALSE), \n+\tensembl_host = GalaxyCharacterParam(required=FALSE), \n+\tensembl_dataset = GalaxyCharacterParam(required=FALSE), \n \tdbsnp_str = GalaxyCharacterParam(required=FALSE), \n \tcosmic = GalaxyLogicalParam(required=FALSE), \n \tdbkey_description = GalaxyCharacterParam(required=FALSE), \n \toutputFile = GalaxyOutput("output","json"))\n {\n+ options(stringsAsFactors = FALSE, gsubfn.engine = "R")\n+\n if (!file.exists(outputFile))\n {\n gstop("json params file does not exist")\n }\n \n- if (length(dbkey_description) < 1)\n+ if (length(dbkey)+length(ensembl_dataset)+length(ensembl_host) == 0)\n {\n- dbkey_description = dbkey\n+ gstop("one of the genome annotation sources must be specified; either dbkey or host and dataset")\n+ }\n+ else if (length(dbkey) > 0 &&\n+ (length(ensembl_dataset) > 0 || length(ensembl_host) > 0))\n+ {\n+ gstop("only one genome annotation source can be specified; either dbkey or host and dataset")\n }\n \n if (length(dbsnp_str) > 0)\n@@ -53,7 +65,8 @@\n use_cosmic = FALSE\n if (length(cosmic) > 0)\n {\n- if (grepl("^hg", dbkey))\n+ if (length(dbkey) > 0 && grepl("^hg", dbkey) ||\n+ length(ensembl_dataset) > 0 && grepl("^hsapiens", ensembl_dataset))\n {\n use_cosmic = TRUE\n }\n@@ -76,26 +89,96 @@\n gstop("failed to remove json params file after reading")\n })\n \n- ucscTableCodingFastaURL = paste("http://genome.ucsc.edu/cgi-bin/hgTables?db=", dbkey, "&hgSeq.cdsExon=on&hgSeq.granularity=gene&hgSeq.casing=exon&hgSeq.repMasking=lower&hgta_doGenomicDna=get+sequence&hgta_group=genes&hgta_track=refGene&hgta_table=refGene&hgta_regionType=genome", sep="")\n- ucscTableProteinFastaURL = paste("http://genome.ucsc.edu/cgi-bin/hgTables?db=", dbkey, "&hgta_geneSeqType=protein&hgta_doGenePredSequence=submit&hgta_track=refGene&hgta_table=refGene", sep="")\n- codingFastaFilepath = paste(target_directory, "/", dbkey, ".cds.fa", sep="")\n- proteinFastaFilepath = paste(target_directory, "/", dbkey, ".protein.fa", sep="")\n+ # load customProDB from GitHub (NOTE: downloading the zip is faster than cloning the repo with git2r or devtools::install_github)\n+ download.file("https://github.com/chambm/customProDB/archive/c57e5498392197bc598a18c26acb70d7530a921cc57e5498.zip", "customProDB.zip", quiet=TRUE)\n+ unzip("customProDB.zip")\n+ devtools::load_all("customProDB-c57e5498392197bc598a18c26acb70d7530a921c")\n \n- suppressPackageStartupMessages(library(customProDB))\n+ #suppressPackageStartupMessages(library(customProDB))\n options(timeout=3600)\n \n- cat(paste("Downloading coding FASTA from:", ucscTableCodingFastaURL, "\\n"))\n- download.file(ucscTableCodingFastaURL, codingFastaFilepath, quiet=T, mode=\'wb\')\n+ # download protein and coding sequences for UCSC annotation\n+ if (length(dbkey) > 0)\n+ {\n+ proteinFastaFilepath = paste(dbkey, ".protein.fa", sep="")\n+\n+ cat(paste("Downloading protein FASTA from:", getProteinFastaUrlFromUCSC(dbkey), "\\n"))\n+ download.file(getProteinFastaUrlFromUCSC(dbkey), proteinFastaFilepath, quiet=T, mode=\'wb\')\n+\n+ local_cache_path = paste0("customProDB_annota'..b')\n+ }\n+\n+ cat(paste("Preparing Refseq annotation files\\n"))\n+ PrepareAnnotationRefseq(genome=dbkey, CDSfasta=codingFastaFilepath, pepfasta=proteinFastaFilepath, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic, local_cache_path=local_cache_path)\n \n- cat(paste("Downloading protein FASTA from:", ucscTableProteinFastaURL, "\\n"))\n- download.file(ucscTableProteinFastaURL, proteinFastaFilepath, quiet=T, mode=\'wb\')\n+ if (length(dbkey_description) < 1)\n+ {\n+ dbkey_description = dbkey\n+ }\n+ }\n+ else\n+ {\n+ local_cache_path = paste0("customProDB_annotation_", ensembl_dataset, "_", ensembl_host)\n+\n+ suppressPackageStartupMessages(library(biomaRt))\n+ cat(paste("Preparing Ensembl annotation files\\n"))\n+ ensembl_mart = useMart("ENSEMBL_MART_ENSEMBL", dataset=ensembl_dataset, host=ensembl_host)\n+ PrepareAnnotationEnsembl(mart=ensembl_mart, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic, local_cache_path=local_cache_path)\n+\n+ metadata = sqldf::sqldf("SELECT value FROM metadata WHERE name=\'BioMart database version\' OR name=\'BioMart dataset description\' OR name=\'BioMart dataset version\'",\n+ dbname=file.path(target_directory, "txdb.sqlite"))\n+ version = metadata$value[1] # Ensembl Genes 87\n+ assembly = metadata$value[3]\n+ dbkey = paste0(ensembl_dataset, "_", sub(".*?(\\\\d+)", "\\\\1", version, perl=TRUE))\n \n- cat(paste("Preparing Refseq annotation files\\n"))\n- customProDB::PrepareAnnotationRefseq(genome=dbkey, CDSfasta=codingFastaFilepath, pepfasta=proteinFastaFilepath, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic)\n- \n- outputPath = paste(dbkey, "/customProDB", sep="")\n+ # convert Ensembl chromosome names to UCSC for Galaxy compatibility\n+ chromosomeMappingsBaseUrl = "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master"\n+ assemblyNoGrcPatch = sub("(\\\\S+?)(\\\\.p\\\\S+)?$", "\\\\1", assembly, perl=TRUE)\n+ chromosomeMappingsUrl = qq("@{chromosomeMappingsBaseUrl}/@{assemblyNoGrcPatch}_ensembl2UCSC.txt")\n+ if (RCurl::url.exists(chromosomeMappingsUrl))\n+ {\n+ cat(qq("Converting Ensembl chromosome names from: @{chromosomeMappingsUrl}\\n"))\n+ e2u = read.delim(chromosomeMappingsUrl, header=FALSE, col.names=c("ensembl", "ucsc"))\n+ e2u = setNames(as.list(e2u$ucsc), e2u$ensembl)\n+ load(file.path(target_directory, "exon_anno.RData"))\n+ exon$chromosome_name = sapply(exon$chromosome_name, function(x) e2u[[as.character(x)]])\n+ exon = exon[nzchar(exon$chromosome_name), ] # omit genome patches with no mapping\n+ save(exon, file=file.path(target_directory, "exon_anno.RData"))\n+ }\n+ else\n+ {\n+ gwarning(qq("unable to convert Ensembl chromosome names to UCSC; mapping file @{assemblyNoGrcPatch}_ensembl2UCSC.txt does not exist"))\n+ }\n+\n+ if (length(dbkey_description) < 1)\n+ {\n+ dbkey_description = qq("@{ensembl_dataset} (@{version}) (@{assembly})")\n+ }\n+ }\n+\n+ qualified_dbkey = dbkey\n+\n+ if (length(dbsnp_str) > 0 && nzchar(dbsnp_str))\n+ {\n+ qualified_dbkey = qq("@{qualified_dbkey}_db@{dbsnp_str}")\n+ dbkey_description = qq("@{dbkey_description} (db@{dbsnp_str})")\n+ }\n+\n+ if (length(cosmic) > 0)\n+ {\n+ qualified_dbkey = qq("@{qualified_dbkey}_cosmic")\n+ dbkey_description = qq("@{dbkey_description} (COSMIC)")\n+ }\n+\n+ outputPath = paste0(qualified_dbkey, "/customProDB")\n output = list(data_tables = list())\n- output[["data_tables"]][["customProDB"]]=c(path=outputPath, name=dbkey_description, dbkey=dbkey, value=dbkey)\n+ output[["data_tables"]][["customProDB"]]=c(path=outputPath, name=dbkey_description, dbkey=qualified_dbkey, value=qualified_dbkey)\n write(toJSON(output), file=outputFile)\n }\n \n' |
b |
diff -r 45755942ae7b -r 9b4ee836e35b data_manager/customProDB_annotation.xml --- a/data_manager/customProDB_annotation.xml Tue Mar 14 14:11:55 2017 -0400 +++ b/data_manager/customProDB_annotation.xml Thu Jun 08 10:55:08 2017 -0400 |
[ |
@@ -1,7 +1,25 @@ -<tool id="custom_pro_db_annotation_data_manager" name="CustomProDB Annotation" tool_type="manage_data" version="0.0.1"> +<tool id="custom_pro_db_annotation_data_manager" name="CustomProDB Annotation" tool_type="manage_data" version="1.16.1.0"> <description>builder</description> <requirements> - <requirement type="package" version="1.14.0">bioconductor-customprodb</requirement> + <requirement type="package" version="3.3.1">r-base</requirement> + <!--<requirement type="package" version="1.14.0">bioconductor-customprodb</requirement>--> + <requirement type="package" version="1.18.0">bioconductor-rgalaxy</requirement> + <requirement type="package" version="1.21.0">bioconductor-biocinstaller</requirement> + <requirement type="package" version="1.20.3">bioconductor-variantannotation</requirement> + <requirement type="package" version="1.26.4">bioconductor-genomicfeatures</requirement> + <requirement type="package" version="1.11.1">r-devtools</requirement> + <requirement type="package" version="3.98_1.4">r-xml</requirement> + <requirement type="package" version="0.10.11">r-rmysql</requirement> + <requirement type="package" version="1.0.2">r-testthat</requirement> + <requirement type="package" version="0.1.0">r-getoptlong</requirement> + <requirement type="package" version="1.1.2">r-stringi</requirement> + <requirement type="package" version="1.1.0">r-stringr</requirement> + <requirement type="package" version="1.10.0">r-data.table</requirement> + <requirement type="package" version="0.4_10">r-sqldf</requirement> + <requirement type="package" version="0.6_6">r-gsubfn</requirement> + <requirement type="package" version="2.3_47">r-chron</requirement> + <requirement type="package" version="0.3_10">r-proto</requirement> + <requirement type="package" version="1.8.4">r-plyr</requirement> </requirements> <stdio> <exit_code range=":-1" /> @@ -9,17 +27,43 @@ </stdio> <command><![CDATA[ Rscript --vanilla '$__tool_directory__/customProDB_annotation.R' - --outputFile '${out_file}' - --dbkey '${dbkey}' - --dbsnp '${dbsnp}' - $cosmic - --dbkey_description '${ dbkey.get_display_text() }' - 2>1 + --outputFile '${out_file}' + + #if str($transcriptome_annotation.source) == 'refseq': + --dbkey '${transcriptome_annotation.dbkey}' + --dbkey_description '${ transcriptome_annotation.dbkey.get_display_text().strip("\"'") }' + #else: + --ensembl_dataset '${transcriptome_annotation.ensembl_dataset.fields.dataset}' + --ensembl_host '${transcriptome_annotation.ensembl_dataset.fields.host}' + --dbkey_description '${transcriptome_annotation.ensembl_dataset.fields.name}' + #end if + + --dbsnp '${dbsnp}' + $cosmic + 2>&1 ]]> </command> <inputs> - <param type="genomebuild" name="dbkey" value="" label="UCSC dbKey for reference genome" /> - <param type="text" name="dbsnp" value="" label="dbSNP identifier currently available from UCSC" help="e.g. 'snp142'" /> + <conditional name="transcriptome_annotation"> + <param name="source" type="select" label="What source do you want to use for mapping between genes, transcripts, and proteins?" help="RefSeq transcripts are like NM_xxxx, Ensembl transcripts are like ENSTxxxx"> + <option value="refseq">Annotate transcriptome with RefSeq (from NCBI/UCSC)</option> + <option value="ensembl">Annotate transcriptome with Ensembl (from Biomart)</option> + </param> + <when value="refseq"> + <param type="genomebuild" name="dbkey" value="" label="UCSC dbKey for reference genome" help="e.g. hg38, hg19, mm10, canFam31" /> + </when> + <when value="ensembl"> + <param type="select" name="ensembl_dataset" value="" label="Ensembl reference genome identifier"> + <options from_file="ensembl_datasets.loc"> + <column name="value" index="2" /> + <column name="dataset" index="0" /> + <column name="host" index="1" /> + <validator type="no_options" message="Ensembl dataset list not loaded"/> + </options> + </param> + </when> + </conditional> + <param type="text" name="dbsnp" value="" label="dbSNP identifier (select organisms only, e.g. human, mouse, cow)" help="e.g. snp142" /> <param type="boolean" name="cosmic" truevalue="--cosmic true" falsevalue="" label="Annotate somatic SNPs from COSMIC (human only)" /> </inputs> <outputs> @@ -29,8 +73,12 @@ .. class:: infomark -**Notice:** If you leave name, description, or id blank, it will be generated automatically. +This data manager creates the transcriptome annotation in the RData format needed by customProDB. +Two annotation sources are supported: UCSC and Ensembl. +Note that because UCSC's table browser only provides current gene annotations for a given genome assembly, +only the Ensembl annotation is entirely reproducible, i.e. running again with the same settings next month will create the same annotation. +Ensembl chromosome names (1,2, ...) are converted to UCSC format (chr1,chr2, ...) to ease integration with other Galaxy tools. </help> <citations> <citation type="doi">10.1093/bioinformatics/btt543</citation> |
b |
diff -r 45755942ae7b -r 9b4ee836e35b ensembl_datasets.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ensembl_datasets.loc.sample Thu Jun 08 10:55:08 2017 -0400 |
b |
b'@@ -0,0 +1,113 @@\n+aaegypti_gene_ensembl\tmay2009.archive.ensembl.org\tA.aegypti genes (Ensembl 54 aaegypti) (AaegL1)\n+amelanoleuca_gene_ensembl\tmay2017.archive.ensembl.org\tPanda genes (Ensembl 89 amelanoleuca) (ailMel1)\n+aplatyrhynchos_gene_ensembl\tmay2017.archive.ensembl.org\tDuck genes (Ensembl 89 aplatyrhynchos) (BGI_duck_1.0)\n+acarolinensis_gene_ensembl\tmay2017.archive.ensembl.org\tAnole lizard genes (Ensembl 89 acarolinensis) (AnoCar2.0)\n+acarolinensis_gene_ensembl\tmay2009.archive.ensembl.org\tAnole lizard genes (Ensembl 54 acarolinensis) (AnoCar1.0)\n+agambiae_gene_ensembl\tmay2009.archive.ensembl.org\tMosquito genes (Ensembl 54 agambiae) (AgamP3)\n+amexicanus_gene_ensembl\tmay2017.archive.ensembl.org\tCave fish genes (Ensembl 89 amexicanus) (AstMex102)\n+btaurus_gene_ensembl\tmay2017.archive.ensembl.org\tCow genes (Ensembl 89 btaurus) (UMD3.1)\n+btaurus_gene_ensembl\tmay2009.archive.ensembl.org\tCow genes (Ensembl 54 btaurus) (Btau_4.0)\n+celegans_gene_ensembl\tmay2017.archive.ensembl.org\tC.elegans genes (Ensembl 89 celegans) (WBcel235)\n+celegans_gene_ensembl\tmay2012.archive.ensembl.org\tC.elegans genes (Ensembl 67 celegans) (WBcel215)\n+celegans_gene_ensembl\tmay2009.archive.ensembl.org\tC.elegans genes (Ensembl 54 celegans) (WS190)\n+cjacchus_gene_ensembl\tmay2017.archive.ensembl.org\tMarmoset genes (Ensembl 89 cjacchus) (C_jacchus3.2.1)\n+cfamiliaris_gene_ensembl\tmay2017.archive.ensembl.org\tDog genes (Ensembl 89 cfamiliaris) (CanFam3.1)\n+cfamiliaris_gene_ensembl\tmay2012.archive.ensembl.org\tDog genes (Ensembl 67 cfamiliaris) (CanFam 2.0)\n+tsyrichta_gene_ensembl\tmay2017.archive.ensembl.org\tTarsier genes (Ensembl 89 tsyrichta) (tarSyr1)\n+cporcellus_gene_ensembl\tmay2017.archive.ensembl.org\tGuinea Pig genes (Ensembl 89 cporcellus) (cavPor3)\n+csabaeus_gene_ensembl\tmay2017.archive.ensembl.org\tVervet-AGM genes (Ensembl 89 csabaeus) (ChlSab1.1)\n+choffmanni_gene_ensembl\tmay2017.archive.ensembl.org\tSloth genes (Ensembl 89 choffmanni) (choHof1)\n+cintestinalis_gene_ensembl\tmay2017.archive.ensembl.org\tC.intestinalis genes (Ensembl 89 cintestinalis) (KH)\n+cintestinalis_gene_ensembl\tmay2009.archive.ensembl.org\tC.intestinalis genes (Ensembl 54 cintestinalis) (JGI 2)\n+csavignyi_gene_ensembl\tmay2017.archive.ensembl.org\tC.savignyi genes (Ensembl 89 csavignyi) (CSAV 2.0)\n+drerio_gene_ensembl\tmay2017.archive.ensembl.org\tZebrafish genes (Ensembl 89 drerio) (GRCz10)\n+drerio_gene_ensembl\tmar2015.archive.ensembl.org\tZebrafish genes (Ensembl 79 drerio) (Zv9)\n+drerio_gene_ensembl\tmay2009.archive.ensembl.org\tZebrafish genes (Ensembl 54 drerio) (Zv8)\n+dnovemcinctus_gene_ensembl\tmay2017.archive.ensembl.org\tArmadillo genes (Ensembl 89 dnovemcinctus) (Dasnov3.0)\n+dnovemcinctus_gene_ensembl\tmay2012.archive.ensembl.org\tArmadillo genes (Ensembl 67 dnovemcinctus) (dasNov2)\n+dnovemcinctus_gene_ensembl\tmay2009.archive.ensembl.org\tArmadillo genes (Ensembl 54 dnovemcinctus) (ARMA)\n+dordii_gene_ensembl\tmay2017.archive.ensembl.org\tKangaroo rat genes (Ensembl 89 dordii) (dipOrd1)\n+dmelanogaster_gene_ensembl\tmay2017.archive.ensembl.org\tFly genes (Ensembl 89 dmelanogaster) (BDGP6)\n+dmelanogaster_gene_ensembl\tdec2014.archive.ensembl.org\tFly genes (Ensembl 78 dmelanogaster) (BDGP 5)\n+dmelanogaster_gene_ensembl\tmay2009.archive.ensembl.org\tFly genes (Ensembl 54 dmelanogaster) (BDGP 5.4)\n+etelfairi_gene_ensembl\tmay2017.archive.ensembl.org\tTenrec genes (Ensembl 89 etelfairi) (TENREC)\n+ecaballus_gene_ensembl\tmay2017.archive.ensembl.org\tHorse genes (Ensembl 89 ecaballus) (Equ Cab 2)\n+eeuropaeus_gene_ensembl\tmay2017.archive.ensembl.org\tHedgehog genes (Ensembl 89 eeuropaeus) (eriEur1)\n+fcatus_gene_ensembl\tmay2017.archive.ensembl.org\tCat genes (Ensembl 89 fcatus) (Felis_catus_6.2)\n+fcatus_gene_ensembl\tmay2012.archive.ensembl.org\tCat genes (Ensembl 67 fcatus) (CAT)\n+falbicollis_gene_ensembl\tmay2017.archive.ensembl.org\tCollared flycatcher genes (Ensembl 89 falbicollis) (FicAlb_1.4)\n+gmorhua_gene_ensembl\tmay2017.archive.ensembl.org\tCod genes (Ensembl 89 gmorhua) (gadMor1)\n+ggallus_gene_ensembl\tmay201'..b'(myoLuc1)\n+nleucogenys_gene_ensembl\tmay2017.archive.ensembl.org\tGibbon genes (Ensembl 89 nleucogenys) (Nleu1.0)\n+meugenii_gene_ensembl\tmay2017.archive.ensembl.org\tWallaby genes (Ensembl 89 meugenii) (Meug_1.0)\n+oprinceps_gene_ensembl\tmay2017.archive.ensembl.org\tPika genes (Ensembl 89 oprinceps) (OchPri2.0-Ens)\n+oprinceps_gene_ensembl\tmar2017.archive.ensembl.org\tPika genes (Ensembl 88 oprinceps) (OchPri2.0)\n+oniloticus_gene_ensembl\tmay2017.archive.ensembl.org\tTilapia genes (Ensembl 89 oniloticus) (Orenil1.0)\n+oanatinus_gene_ensembl\tmay2017.archive.ensembl.org\tPlatypus genes (Ensembl 89 oanatinus) (OANA5)\n+ocuniculus_gene_ensembl\tmay2017.archive.ensembl.org\tRabbit genes (Ensembl 89 ocuniculus) (OryCun2.0)\n+ocuniculus_gene_ensembl\tmay2009.archive.ensembl.org\tRabbit genes (Ensembl 54 ocuniculus) (RABBIT)\n+olatipes_gene_ensembl\tmay2017.archive.ensembl.org\tMedaka genes (Ensembl 89 olatipes) (HdrR)\n+ogarnettii_gene_ensembl\tmay2017.archive.ensembl.org\tBushbaby genes (Ensembl 89 ogarnettii) (OtoGar3)\n+ogarnettii_gene_ensembl\tmay2009.archive.ensembl.org\tBushbaby genes (Ensembl 54 ogarnettii) (otoGar1)\n+oaries_gene_ensembl\tmay2017.archive.ensembl.org\tSheep genes (Ensembl 89 oaries) (Oar_v3.1)\n+ptroglodytes_gene_ensembl\tmay2017.archive.ensembl.org\tChimp genes (Ensembl 89 ptroglodytes) (CHIMP2.1.4)\n+ptroglodytes_gene_ensembl\tmay2009.archive.ensembl.org\tChimp genes (Ensembl 54 ptroglodytes) (CHIMP2.1)\n+panubis_gene_ensembl\tmay2017.archive.ensembl.org\tOlive Baboon genes (Ensembl 89 panubis) (PapAnu2.0)\n+psinensis_gene_ensembl\tmay2017.archive.ensembl.org\tChinese softshell turtle genes (Ensembl 89 psinensis) (PelSin_1.0)\n+pmarinus_gene_ensembl\tmay2017.archive.ensembl.org\tLamprey genes (Ensembl 89 pmarinus) (Pmarinus_7.0)\n+pformosa_gene_ensembl\tmay2017.archive.ensembl.org\tAmazon molly genes (Ensembl 89 pformosa) (Poecilia_formosa-5.1.2)\n+pabelii_gene_ensembl\tmay2017.archive.ensembl.org\tOrangutan genes (Ensembl 89 pabelii) (PPYG2)\n+pcapensis_gene_ensembl\tmay2017.archive.ensembl.org\tRock hyrax genes (Ensembl 89 pcapensis) (proCap1)\n+pvampyrus_gene_ensembl\tmay2017.archive.ensembl.org\tMegabat genes (Ensembl 89 pvampyrus) (pteVam1)\n+rnorvegicus_gene_ensembl\tmay2017.archive.ensembl.org\tRat genes (Ensembl 89 rnorvegicus) (Rnor_6.0)\n+rnorvegicus_gene_ensembl\tmar2015.archive.ensembl.org\tRat genes (Ensembl 79 rnorvegicus) (Rnor_5.0)\n+rnorvegicus_gene_ensembl\tmay2012.archive.ensembl.org\tRat genes (Ensembl 67 rnorvegicus) (RGSC 3.4)\n+scerevisiae_gene_ensembl\tmay2017.archive.ensembl.org\tS.cerevisiae genes (Ensembl 89 scerevisiae) (R64-1-1)\n+scerevisiae_gene_ensembl\tdec2013.archive.ensembl.org\tS.cerevisiae genes (Ensembl 74 scerevisiae) (EF 4)\n+scerevisiae_gene_ensembl\tmay2009.archive.ensembl.org\tS.cerevisiae genes (Ensembl 54 scerevisiae) (SGD1.01)\n+sharrisii_gene_ensembl\tmay2017.archive.ensembl.org\tTasmanian Devil genes (Ensembl 89 sharrisii) (Devil_ref v7.0)\n+saraneus_gene_ensembl\tmay2017.archive.ensembl.org\tShrew genes (Ensembl 89 saraneus) (sorAra1)\n+sscrofa_gene_ensembl\tmay2017.archive.ensembl.org\tPig genes (Ensembl 89 sscrofa) (Sscrofa10.2)\n+tguttata_gene_ensembl\tmay2017.archive.ensembl.org\tZebra finch genes (Ensembl 89 tguttata) (taeGut3.2.4)\n+trubripes_gene_ensembl\tmay2017.archive.ensembl.org\tFugu genes (Ensembl 89 trubripes) (FUGU 4.0)\n+tnigroviridis_gene_ensembl\tmay2017.archive.ensembl.org\tTetraodon genes (Ensembl 89 tnigroviridis) (TETRAODON 8.0)\n+tbelangeri_gene_ensembl\tmay2017.archive.ensembl.org\tTree Shrew genes (Ensembl 89 tbelangeri) (tupBel1)\n+ttruncatus_gene_ensembl\tmay2017.archive.ensembl.org\tDolphin genes (Ensembl 89 ttruncatus) (turTru1)\n+vpacos_gene_ensembl\tmay2017.archive.ensembl.org\tAlpaca genes (Ensembl 89 vpacos) (vicPac1)\n+xtropicalis_gene_ensembl\tmay2017.archive.ensembl.org\tXenopus genes (Ensembl 89 xtropicalis) (JGI 4.2)\n+xtropicalis_gene_ensembl\tmay2009.archive.ensembl.org\tXenopus genes (Ensembl 54 xtropicalis) (JGI 4.1)\n+xmaculatus_gene_ensembl\tmay2017.archive.ensembl.org\tPlatyfish genes (Ensembl 89 xmaculatus) (Xipmac4.4.2)\n' |
b |
diff -r 45755942ae7b -r 9b4ee836e35b tool-data/update_ensembl_datasets.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/update_ensembl_datasets.R Thu Jun 08 10:55:08 2017 -0400 |
[ |
@@ -0,0 +1,58 @@ +## +## Run this script to update the table of Ensembl assemblies available in the customProDB annotation data manager (ensembl_datasets.loc) +## + +library(RMySQL) +library(httr) +library(biomaRt) +library(stringdist) + +con = dbConnect(MySQL(), host="ensembldb.ensembl.org", user="anonymous") +archives = dbGetQuery(con, "SHOW DATABASES LIKE 'ensembl_archive_%'") +dbDisconnect(con) + +latestArchive = tail(archives[,1], 1) +con = dbConnect(MySQL(), host="ensembldb.ensembl.org", user="anonymous", dbname=latestArchive) +assemblies = dbGetQuery(con, "SELECT s.name, s.common_name, rs.assembly_name, MAX(rs.release_id) AS latest_release, r.date + FROM species as s, release_species as rs, ens_release as r + WHERE s.species_id = rs.species_id AND r.release_id = rs.release_id AND r.online = 'Y' + AND r.release_id < 10000 -- ignore 10075 (the special GRCh37 site) + GROUP BY rs.assembly_name + ORDER BY s.common_name, rs.release_id") +allReleases = assemblies$latest_release +uniqueReleases = unique(allReleases) + +# Get the <MMMYYYY> style archive link for each Ensembl release +urlRedirectMap = sapply(paste0("e", uniqueReleases, ".ensembl.org"), function(url){XML::parseURI(HEAD(url)$url)$server}) + +## NOTE ## Make sure the following line is updated to the latest Ensembl mirror +assemblies$url = sub("www.", "may2017.archive.", urlRedirectMap[paste0("e", allReleases, ".ensembl.org")], fixed=TRUE) + +# Get all datasets from the archives +datasets = c() +for (archive in unique(assemblies$url)) { + datasets = unique(c(datasets, listDatasets(useMart("ensembl", host=archive))$dataset)) +} +datasets = sub("_gene_ensembl", "", datasets, fixed=TRUE) + +# Match the assembly species names to the datasets (using amatch() because of cases like Mustela_putorius_furo -> mfuro) +assemblies$dataset_id = datasets[amatch(tolower(assemblies$name), datasets, maxDist=3, method="osa", weight=c(0.1, 1, 1, 1))] + +# Remove mouse strains (would need to add these from ENSEMBL_MOUSE_MART) +assemblies = assemblies[-grep("Mus_musculus_\\S+", assemblies$name, perl=TRUE),] + +# Remove unmatched assemblies (e.g. Mus spretus) +assemblies = assemblies[-which(is.na(assemblies$dataset_id)),] + +# Replace underscores in scientific name +assemblies$name = gsub("_", " ", assemblies$name, fixed=TRUE) + +# Sort assemblies first by scientific name, then descending by latest release for that assembly +assemblies = assemblies[order(assemblies$name, -assemblies$latest_release),] + +# Write dataset table (3 columns: dataset_id, host, description) +dataset_id = paste0(assemblies$dataset_id, "_gene_ensembl") +host = paste0(assemblies$url) +description = paste0(assemblies$common_name, " genes (Ensembl ", assemblies$latest_release, " ", assemblies$dataset_id, + ") (", assemblies$assembly_name, ")") +write.csv(paste(dataset_id, host, description, sep="\t"), file="ensembl_datasets.loc.sample") |