Galaxy |

Changeset 4:e967a99d66b3 (2020-01-23)

Previous changeset 3:af0250fd023c (2019-09-05) Next changeset 5:b05fa99ddda2 (2020-02-06)

Commit message:
"planemo upload commit 540dd383c0617193db43bf11457011888751b022-dirty"

modified:
data_manager/resource_building.py
data_manager/resource_building.xml
data_manager_conf.xml
tool_data_table_conf.xml.sample

added:
tool-data/proteore_protein_full_atlas.loc.sample

diff -r af0250fd023c -r e967a99d66b3 data_manager/resource_building.py
--- a/data_manager/resource_building.py Thu Sep 05 07:45:16 2019 -0400
+++ b/data_manager/resource_building.py Thu Jan 23 08:38:02 2020 -0500

[

b'@@ -3,7 +3,7 @@\n The purpose of this script is to create source files from different databases to be used in other proteore tools\n """\n \n-import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile\n+import os, shutil, sys, argparse, requests, time, csv, re, json, shutil, zipfile, subprocess\n from io import BytesIO\n from zipfile import ZipFile\n from galaxy.util.json import from_json_string, to_json_string\n@@ -131,15 +131,17 @@\n import ftplib, gzip\n csv.field_size_limit(sys.maxsize) # to handle big files\n \n-def id_mapping_sources (data_manager_dict, species, target_directory) :\n+def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :\n \n human = species == "Human"\n species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }\n files=["idmapping_selected.tab.gz","idmapping.dat.gz"]\n+ archive = os.path.join(tool_data_path, "id_mapping/ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d")))\n+ if os.path.isdir(archive) is False : os.mkdir(archive)\n \n #header\n- if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]\n- else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]\n+ if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",\'Gene_Name\']]\n+ else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",\'Gene_Name\']]\n \n #get selected.tab and keep only ids of interest\n 
selected_tab_file=species_dict[species]+"_"+files[0]\n@@ -147,9 +149,9 @@\n with gzip.open(tab_path,"rt") as select :\n tab_reader = csv.reader(select,delimiter="\\t")\n for line in tab_reader :\n- tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])\n- os.remove(tab_path)\n-\n+ tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])\n+ if os.path.exists(os.path.join(archive,tab_path.split("/")[-1])) : os.remove(os.path.join(archive,tab_path.split("/")[-1]))\n+ shutil.move(tab_path, archive)\n #print("selected_tab ok")\n \n #get uniprot-AC reviewed\n@@ -161,10 +163,16 @@\n decoded_content = download.content.decode(\'utf-8\')\n uniprot_reviewed_list = decoded_content.splitlines()\n \n+ #save reviewed list\n+ reviewed_list_path = os.path.join(archive,\'uniprot_reviewed_list.txt\')\n+ with open(reviewed_list_path,\'w\') as reviewed_list_file:\n+ for id in uniprot_reviewed_list:\n+ reviewed_list_file.write(id+"\\n")\n+\n+ #remove unreviewed uniprot-AC\n for line in tab[1:]:\n- UniProtAC = line[0]\n+ UniProtAC = line[1]\n if UniProtAC not in uniprot_reviewed_list :\n- line[0]=""\n line[1]=""\n \n """\n@@ -173,12 +181,12 @@\n """\n \n #there\'s more id type for human\n- if human : ids = [\'neXtProt\',\'BioGrid\',\'STRING\',\'KEGG\' ] #ids to get from dat_file\n- else : ids = [\'BioGrid\',\'STRING\',\'KEGG\' ]\n+ if human : ids = [\'neXtProt\',\'BioGrid\',\'STRING\',\'KEGG\',\'Gene_Name\' ] #ids to get from dat_file\n+ else : ids = [\'BioGrid\',\'STRING\',\'KEGG\',\'Gene_Name\' ]\n unidict = {}\n \n #keep only ids of interest in dictionaries\n- dat_file=species_dict[species]+"_"+files[1]\n+ dat_file = species_dict[species]+"_"+files[1]\n dat_path = download_from_uniprot_ftp(dat_file,target_directory)\n with gzip.open(dat_path,"rt") as dat :\n dat_reader = csv.reader(dat,delimiter="\\t")\n@@ -194,7 +202,8 @@\n unidict[uniprotID].update({ id'..b'nd(["","","",""])\n+ line.extend(["","","","",""])\n else :\n if uniprotID in unidict :\n 
line.extend([access_dictionary(unidict,uniprotID,\'BioGrid\'),access_dictionary(unidict,uniprotID,\'STRING\'),\n- access_dictionary(unidict,uniprotID,\'KEGG\')])\n+ access_dictionary(unidict,uniprotID,\'KEGG\'),access_dictionary(unidict,uniprotID,\'Gene_Name\')])\n else :\n- line.extend(["","",""])\n+ line.extend(["","","",""])\n \n #print ("tab ok")\n \n #add missing nextprot ID for human or replace old ones\n if human : \n #build next_dict\n- nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)\n+ nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)\n+ with open(nextprot_path,\'r\') as nextprot_ids :\n+ nextprot_ids = nextprot_ids.read().splitlines()\n+ if os.path.exists(os.path.join(archive,nextprot_path.split("/")[-1])) : os.remove(os.path.join(archive,nextprot_path.split("/")[-1]))\n+ shutil.move(nextprot_path,archive)\n next_dict = {}\n for nextid in nextprot_ids : \n next_dict[nextid.replace("NX_","")] = nextid\n- os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))\n+ #os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))\n \n #add missing nextprot ID\n for line in tab[1:] : \n uniprotID=line[0]\n- nextprotID=line[13]\n+ nextprotID=line[14]\n if uniprotID in next_dict and (nextprotID == \'\' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :\n- line[13]=next_dict[uniprotID]\n+ line[14]=next_dict[uniprotID]\n \n output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"\n path = os.path.join(target_directory,output_file)\n@@ -240,6 +253,9 @@\n with open(path,"w") as out :\n w = csv.writer(out,delimiter=\'\\t\')\n w.writerows(tab)\n+ \n+ subprocess.call([\'tar\', \'-czvf\', archive+".tar.gz", archive])\n+ shutil.rmtree(archive, ignore_errors=True)\n \n name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}\n name = species +" (" + name_dict[species]+" 
"+time.strftime("%d/%m/%Y")+")"\n@@ -267,9 +283,8 @@\n ftp.cwd(ftp_dir)\n ftp.retrbinary("RETR " + file, open(path, \'wb\').write)\n ftp.quit()\n- with open(path,\'r\') as nextprot_ids :\n- nextprot_ids = nextprot_ids.read().splitlines()\n- return (nextprot_ids)\n+\n+ return (path)\n \n #return \'\' if there\'s no value in a dictionary, avoid error\n def access_dictionary (dico,key1,key2) :\n@@ -373,7 +388,7 @@\n elif interactome=="bioplex":\n \n with requests.Session() as s:\n- r = s.get(\'http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv\',verify=False)\n+ r = s.get(\'http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv\')\n r = r.content.decode(\'utf-8\')\n bioplex = csv.reader(r.splitlines(), delimiter=\'\\t\')\n \n@@ -597,6 +612,7 @@\n parser.add_argument("--date")\n parser.add_argument("-o", "--output")\n parser.add_argument("--database")\n+ parser.add_argument("--tool_data_path")\n args = parser.parse_args()\n \n data_manager_dict = {}\n@@ -631,13 +647,13 @@\n \n ## Download ID_mapping source file from Uniprot\n try:\n- id_mapping=args.id_mapping\n+ id_mapping = args.id_mapping\n except NameError:\n id_mapping = None\n if id_mapping is not None:\n id_mapping = id_mapping .split(",")\n for species in id_mapping :\n- id_mapping_sources(data_manager_dict, species, target_directory)\n+ id_mapping_sources(data_manager_dict, species, target_directory, args.tool_data_path)\n \n ## Download PPI ref files from biogrid/bioplex/humap\n try:\n'

diff -r af0250fd023c -r e967a99d66b3 data_manager/resource_building.xml
--- a/data_manager/resource_building.xml Thu Sep 05 07:45:16 2019 -0400
+++ b/data_manager/resource_building.xml Thu Jan 23 08:38:02 2020 -0500

@@ -1,4 +1,4 @@
-<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.09.05" tool_type="manage_data">
+<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2020.01.21" tool_type="manage_data">
<description>
to create or update reference files for proteore tools
</description>
@@ -27,6 +27,7 @@
         --database=$database.database
     #end if
     --output "$output"
+    --tool_data_path=$__tool_data_path__

]]></command>

@@ -43,7 +44,7 @@
             <param name="tissues" type="select" multiple="false" label="Please select tissue">
                 <option value="HPA_normal_tissue">Normal tissue</option>
                 <option value="HPA_pathology">Pathology</option>
-                
+                <option value="HPA_full_atlas">Full Atlas</option>
             </param>
         </when>
         <when value="peptide_atlas">

diff -r af0250fd023c -r e967a99d66b3 data_manager_conf.xml
--- a/data_manager_conf.xml Thu Sep 05 07:45:16 2019 -0400
+++ b/data_manager_conf.xml Thu Jan 23 08:38:02 2020 -0500

@@ -46,6 +46,21 @@
                 </column>
             </output>
         </data_table>
+        <data_table name="proteore_protein_full_atlas">
+            <output>
+                <column name="id"/>
+                <column name="release"/>
+                <column name="name" />
+                <column name="tissue" />
+                <column name="value" output_ref="output" >
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">protein_atlas/</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/protein_atlas/${release}.tsv</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
         <data_table name="proteore_id_mapping_Human">
             <output>
                 <column name="id" />

diff -r af0250fd023c -r e967a99d66b3 tool-data/proteore_protein_full_atlas.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_protein_full_atlas.loc.sample Thu Jan 23 08:38:02 2020 -0500

@@ -0,0 +1,4 @@
+#This file lists the location names and values of the reference files for the Get expression data tool
+#This is a tab-separated file (TAB, not 4 spaces!)
+#<id> <release> <name> <tissue> <value>
+#9979819281 HPA_full_atlas_19-07-2018 HPA Full Protein Atlas 19/07/2018 HPA_full_atlas /projet/galaxydev/galaxy/tool-data/protein_atlas/projet/galaxydev/galaxy/database/jobs_directory/019/19160/dataset_39308_files/HPA_full_atlas_19-07-2018.tsv

diff -r af0250fd023c -r e967a99d66b3 tool_data_table_conf.xml.sample
--- a/tool_data_table_conf.xml.sample Thu Sep 05 07:45:16 2019 -0400
+++ b/tool_data_table_conf.xml.sample Thu Jan 23 08:38:02 2020 -0500

@@ -12,6 +12,10 @@
     <columns>id, release, name, tissue, value</columns>
     <file path="tool-data/proteore_protein_atlas_tumor_tissue.loc" />
   </table>
+  <table name="proteore_protein_full_atlas" comment_char="#">
+    <columns>id, release, name, tissue, value</columns>
+    <file path="tool-data/proteore_protein_full_atlas.loc" />
+  </table>
   <table name="proteore_id_mapping_Human" comment_char="#">
     <columns>id, release, name, species, value</columns>
     <file path="tool-data/proteore_id_mapping_Human.loc" />