diff data_manager/fetch_mothur_reference_data.py @ 4:0db22932bc39 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit f845716f6ac93500f143a30abef97eaba406344e"
author iuc
date Fri, 25 Jun 2021 09:37:05 +0000
parents 9d09724f2bf1
children
--- a/data_manager/fetch_mothur_reference_data.py	Sun Nov 22 12:51:44 2020 +0000
+++ b/data_manager/fetch_mothur_reference_data.py	Fri Jun 25 09:37:05 2021 +0000
@@ -1,6 +1,7 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 #
 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools
+import io
 import json
 import optparse
 import os
@@ -8,7 +9,9 @@
 import sys
 import tarfile
 import tempfile
-import urllib2
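+# urllib2's functionality lives in urllib.request, urllib.parse and urllib.error on Python 3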
+import urllib.error
+import urllib.parse
+import urllib.request
 import zipfile
 from functools import reduce
 
@@ -38,6 +41,14 @@
     },
     # RDP reference files
     # http://www.mothur.org/wiki/RDP_reference_files
+    "RDP_v18": {
+        "16S rRNA RDP training set 18":
+            [
+                "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.rdp.tgz", ],
+        "16S rRNA PDS training set 18":
+            [
+                "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.pds.tgz", ],
+    },
     "RDP_v16": {
         "16S rRNA RDP training set 16":
         ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ],
@@ -76,6 +87,12 @@
     },
     # Silva reference files
     # http://www.mothur.org/wiki/Silva_reference_files
+    "silva_release_138.1": {
+        "SILVA release 138.1":
+            [
+                "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v138_1.tgz",
+                "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v138_1.tgz", ],
+    },
     "silva_release_128": {
         "SILVA release 128":
         ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz",
@@ -230,14 +247,19 @@
     Returns the name that the file is saved with.
 
     """
-    print("Downloading %s" % url)
+    print(f"Downloading {url}")
     if not target:
         target = os.path.basename(url)
     if wd:
         target = os.path.join(wd, target)
-    print("Saving to %s" % target)
+    print(f"Saving to {target}")
     with open(target, 'wb') as fh:
-        fh.write(urllib2.urlopen(url).read())
+        url_h = urllib.request.urlopen(url)
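+        # Read the response in io.DEFAULT_BUFFER_SIZE chunks so large reference
+        # archives are never held in memory all at once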
+        while True:
+            buffer = url_h.read(io.DEFAULT_BUFFER_SIZE)
+            if buffer == b"":
+                break
+            fh.write(buffer)
     return target
 
 
@@ -257,13 +279,13 @@
 
     """
     if not zipfile.is_zipfile(filen):
-        print("%s: not ZIP formatted file")
+        print(f"{filen}: not ZIP formatted file")
         return [filen]
     file_list = []
     with zipfile.ZipFile(filen) as z:
         for name in z.namelist():
             if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
-                print("Ignoring %s" % name)
+                print(f"Ignoring {name}")
                 continue
             if wd:
                 target = os.path.join(wd, name)
@@ -271,14 +293,14 @@
                 target = name
             if name.endswith('/'):
                 # Make directory
-                print("Creating dir %s" % target)
+                print(f"Creating dir {target}")
                 try:
                     os.makedirs(target)
                 except OSError:
                     pass
             else:
                 # Extract file
-                print("Extracting %s" % name)
+                print("Extracting {target}")
                 try:
                     os.makedirs(os.path.dirname(target))
                 except OSError:
@@ -286,7 +308,7 @@
                 with open(target, 'wb') as fh:
                     fh.write(z.read(name))
                 file_list.append(target)
-    print("Removing %s" % filen)
+    print(f"Removing {filen}")
     os.remove(filen)
     return file_list
 
@@ -309,23 +331,23 @@
     """
     file_list = []
     if not tarfile.is_tarfile(filen):
-        print("%s: not TAR file")
+        print(f"{filen}: not TAR file")
         return [filen]
     with tarfile.open(filen) as t:
         for name in t.getnames():
             # Check for unwanted files
             if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
-                print("Ignoring %s" % name)
+                print(f"Ignoring {name}")
                 continue
             # Extract file
-            print("Extracting %s" % name)
+            print(f"Extracting {name}")
             t.extract(name, wd)
             if wd:
                 target = os.path.join(wd, name)
             else:
                 target = name
             file_list.append(target)
-    print("Removing %s" % filen)
+    print(f"Removing {filen}")
     os.remove(filen)
     return file_list
 
@@ -343,9 +365,9 @@
     current working directory.
 
     """
-    print("Unpack %s" % filen)
+    print(f"Unpack {filen}")
     ext = os.path.splitext(filen)[1]
-    print("Extension: %s" % ext)
+    print(f"Extension: {ext}")
     if ext == ".zip":
         return unpack_zip_archive(filen, wd=wd)
     elif ext == ".tgz":
@@ -386,7 +408,7 @@
     try:
         return MOTHUR_FILE_TYPES[ext]
     except KeyError:
-        print("WARNING: unknown file type for " + filen + ", skipping")
+        print(f"WARNING: unknown file type for {filen}, skipping")
         return None
 
 
@@ -419,26 +441,27 @@
     """
     # Make working dir
     wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd())
-    print("Working dir %s" % wd)
+    print(f"Working dir {wd}")
     # Iterate over all requested reference data URLs
     for dataset in datasets:
-        print("Handling dataset '%s'" % dataset)
+        print(f"Handling dataset '{dataset}'")
         for name in MOTHUR_REFERENCE_DATA[dataset]:
             for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd):
                 type_ = identify_type(f)
-                entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name)
-                print("%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f)))
+                name_from_file = os.path.splitext(os.path.basename(f))[0]
+                entry_name = f"{name_from_file} ({name})"
+                print(f"{type_}\t\'{entry_name}'\t.../{os.path.basename(f)}")
                 if type_ is not None:
                     # Move to target dir
                     ref_data_file = os.path.basename(f)
                     f1 = os.path.join(target_dir, ref_data_file)
-                    print("Moving %s to %s" % (f, f1))
-                    os.rename(f, f1)
+                    print(f"Moving {f} to {f1}")
+                    shutil.move(f, f1)
                     # Add entry to data table
-                    table_name = "mothur_%s" % type_
+                    table_name = f"mothur_{type_}"
                     add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))
     # Remove working dir
-    print("Removing %s" % wd)
+    print(f"Removing {wd}")
     shutil.rmtree(wd)
 
 
@@ -454,7 +477,7 @@
     files = []
     for path in paths:
         path = os.path.abspath(path)
-        print("Examining '%s'..." % path)
+        print(f"Examining '{path}'...")
         if os.path.isfile(path):
             # Store full path for file
             files.append(path)
@@ -493,21 +516,21 @@
     for f in files:
         type_ = identify_type(f)
         if type_ is None:
-            print("%s: unrecognised type, skipped" % f)
+            print(f"{f}: unrecognised type, skipped")
             continue
         ref_data_file = os.path.basename(f)
         target_file = os.path.join(target_dir, ref_data_file)
         entry_name = "%s" % os.path.splitext(ref_data_file)[0]
         if description:
             entry_name += " (%s)" % description
-        print("%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file))
+        print(f"{type_}\t\'{entry_name}'\t.../{ref_data_file}")
         # Link to or copy the data
         if link_to_data:
             os.symlink(f, target_file)
         else:
             shutil.copyfile(f, target_file)
         # Add entry to data table
-        table_name = "mothur_%s" % type_
+        table_name = f"mothur_{type_}"
         add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))
 
 
@@ -522,8 +545,8 @@
     parser.add_option('--description', action='store', dest='description', default='')
     parser.add_option('--link', action='store_true', dest='link_to_data')
     options, args = parser.parse_args()
-    print("options: %s" % options)
-    print("args   : %s" % args)
+    print(f"options: {options}")
+    print(f"args   : {args}")
 
     # Check for JSON file
     if len(args) != 1:
@@ -536,7 +559,7 @@
     params, target_dir = read_input_json(jsonfile)
 
     # Make the target directory
-    print("Making %s" % target_dir)
+    print(f"Making {target_dir}")
     os.mkdir(target_dir)
 
     # Set up data tables dictionary