diff data_manager/data_manager_fetch_and_index_maf.py @ 0:de73b258a601 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_index_maf commit 21852ee28cf191d12b1ffe5583efaa5deeb1d80d-dirty"
author dave
date Wed, 15 Jul 2020 14:30:00 -0400
parents
children edf39ed96bc3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_fetch_and_index_maf.py	Wed Jul 15 14:30:00 2020 -0400
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+import bz2
+import ftplib
+import gzip
+import json
+import optparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import urllib.parse
+import urllib.request
+import zipfile
+from binascii import hexlify
+
+CHUNK_SIZE = 2**20
+
+DEFAULT_DATA_TABLE_NAME = "indexed_maf_files"
+
+# Nice solution to opening compressed files (zip/bz2/gz) transparently
+# https://stackoverflow.com/a/13045892/638445
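+# Each CompressedFile subclass declares the magic bytes that identify its
+# format; get_compressed_file() below sniffs the start of a file and returns
+# an instance of the first subclass whose magic matches.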
+
+
+class CompressedFile(object):
+    magic = None
+    file_type = None
+    mime_type = None
+    proper_extension = None
+
+    def __init__(self, f):
+        # f is the path to the (possibly compressed) file on disk
+        self.f = f
+        self.accessor = self.open()
+
+    @classmethod
+    def is_magic(cls, data):
+        return hexlify(data).startswith(hexlify(cls.magic))
+
+    def open(self):
+        return None
+
+
+class ZIPFile(CompressedFile):
+    magic = b'\x50\x4b\x03\x04'
+    file_type = 'zip'
+    mime_type = 'compressed/zip'
+
+    def open(self):
+        # Open the first archive member so the accessor is file-like;
+        # a bare ZipFile cannot be streamed with shutil.copyfileobj
+        zf = zipfile.ZipFile(self.f)
+        return zf.open(zf.namelist()[0])
+
+
+class BZ2File(CompressedFile):
+    magic = b'\x42\x5a\x68'
+    file_type = 'bz2'
+    mime_type = 'compressed/bz2'
+
+    def open(self):
+        return bz2.BZ2File(self.f)
+
+
+class GZFile(CompressedFile):
+    magic = b'\x1f\x8b\x08'
+    file_type = 'gz'
+    mime_type = 'compressed/gz'
+
+    def open(self):
+        return gzip.GzipFile(self.f)
+
+
+# Factory function to create a suitable instance for accessing files
+def get_compressed_file(filename):
+    with open(filename, 'rb') as f:
+        start_of_file = f.read(16)
+    for cls in (ZIPFile, BZ2File, GZFile):
+        if cls.is_magic(start_of_file):
+            return cls(filename)
+    return None
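+# Example usage (hypothetical file name):
+#   cf = get_compressed_file('chr1.maf.gz')
+#   if cf is not None:
+#       data = cf.accessor.read()  # reads decompressed bytes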
+
+
+def url_download(url, tmp=False, localpath=None):
+    """Attempt to download file from a given url
+    :param url: full url to file
+    :type url: str.
+    :returns: name of downloaded file
+    :raises: ContentDecodingError, IOError
+    """
+
+    # Generate file_name
+    file_name = url.split('/')[-1]
+    if tmp:
+        file_name = os.path.join(tempfile.mkdtemp(), file_name)
+    elif localpath is not None:
+        file_name = os.path.join(localpath, file_name)
+
+    try:
+        # download URL (FTP and HTTP work, probably local and data too)
+        urllib.request.urlretrieve(url, file_name)
+
+        # uncompress file if needed
+        cf = get_compressed_file(file_name)
+        if cf is not None:
+            uncompressed_file_name = os.path.splitext(file_name)[0]
+            with open(uncompressed_file_name, 'wb') as uncompressed_file:
+                shutil.copyfileobj(cf.accessor, uncompressed_file)
+            os.remove(file_name)
+            file_name = uncompressed_file_name
+    except IOError as e:
+        sys.stderr.write('Error occurred downloading reference file: %s\n' % e)
+        if os.path.exists(file_name):
+            os.remove(file_name)
+    return file_name
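+# Example usage (hypothetical URL):
+#   local_maf = url_download('ftp://example.org/maf/chr1.maf.gz', localpath='.')
+#   # downloads, detects the gzip magic bytes, and returns './chr1.maf'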
+
+
+def generate_metadata(params, options):
+    name = options.name
+    # Build a safe unique id by collapsing runs of non-alphanumeric
+    # characters into single underscores
+    pattern = re.compile(r'[\W]+')
+    uid = pattern.sub('_', name).strip('_')
+    url = options.nexus
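+    # Species names come from the Newick (.nh) tree, assumed pretty-printed
+    # with one node per line: e.g. a leaf "(hg38:0.034," strips to "hg38"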
+    with open(url_download(url, True), 'r') as fh:
+        species = [line.strip(' (),').split(':')[0] for line in fh.readlines()]
+    return name, uid.upper(), species
+
+
+def get_maf_listing(maf_path):
+    maf_files = []
+    maf_url = urllib.parse.urlparse(maf_path)
+    f = ftplib.FTP()
+    f.connect(maf_url.netloc)
+    f.login()
+    listing = f.mlsd(maf_url.path)
+    compressions = ['gz', 'bz2', 'zip']
+    for name, facts in listing:
+        skip = False
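+        # Keep only compressed regular files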
+        if os.path.splitext(name)[-1].lstrip('.') not in compressions:
+            skip = True
+        if facts['type'] != 'file':
+            skip = True
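+        # Skip alternate-locus and random-scaffold alignments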
+        for compression in compressions:
+            for exclusion in ['_alt', '_random']:
+                if name.endswith('%s.maf.%s' % (exclusion, compression)):
+                    skip = True
+                    break
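+        # Skip unplaced (chrUn) scaffolds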
+        if name.startswith('chrUn'):
+            skip = True
+        if skip:
+            continue
+        maf_files.append(urllib.parse.urljoin(maf_path, name))
+    f.close()
+    return maf_files
+
+
+def index_maf_files(maf_files, maf_path, options, params, target_directory):
+    for maf_file in maf_files:
+        maf_url = urllib.parse.urljoin(maf_path, maf_file)
+        local_maf = url_download(maf_url, localpath=target_directory)
+        # maf_build_index.py (shipped with bx-python) writes the interval
+        # index next to the MAF as local_maf + '.index'
+        index_command = ['maf_build_index.py', local_maf, local_maf + '.index']
+        subprocess.check_call(index_command)
+
+
+def main():
+    parser = optparse.OptionParser()
+    parser.add_option('-x', '--nexus', dest='nexus', action='store', type='string', help='URL for .nh')
+    parser.add_option('-a', '--alignments', dest='alignments', action='store', type='string', help='URL for alignments')
+    parser.add_option('-n', '--name', dest='name', action='store', type='string', help='Name')
+    parser.add_option('-o', '--output', dest='output', action='store', type='string', help='Output')
+    parser.add_option('-d', '--dbkey', dest='dbkey', action='store', type='string', help='dbkey')
+    (options, args) = parser.parse_args()
+
+    # Galaxy writes the data manager's JSON parameters to the output file;
+    # read them to find where the downloaded MAFs and indexes should go
+    with open(options.output) as fh:
+        params = json.load(fh)
+    target_directory = params['output_data'][0]['extra_files_path']
+    os.makedirs(target_directory, exist_ok=True)
+
+    display_name, uid, species_list = generate_metadata(params, options)
+    maf_path = urllib.parse.urljoin(options.nexus, 'maf/')
+    maf_files = get_maf_listing(maf_path)
+
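+    # The data manager reports new entries as JSON keyed on the data table
+    # name; the fields below must match the table's columns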
+    data_manager_entry = {
+        'data_tables': {
+            DEFAULT_DATA_TABLE_NAME: {
+                'name': display_name,
+                'dbkey': options.dbkey,  # This is needed for the output path
+                'value': uid,
+                'indexed_for': ','.join(species_list),
+                'exists_in_maf': ','.join(species_list),
+                'path': ','.join([maf_file.split('/')[-1] for maf_file in maf_files]),
+            }
+        }
+    }
+
+    # Fetch and index the MAFs
+    index_maf_files(maf_files, maf_path, options, params, target_directory)
+    with open(options.output, 'w') as fh:
+        fh.write(json.dumps(data_manager_entry))
+
+
+if __name__ == "__main__":
+    main()