comparison data_manager/data_manager_fetch_and_index_maf.py @ 0:de73b258a601 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_index_maf commit 21852ee28cf191d12b1ffe5583efaa5deeb1d80d-dirty"
author dave
date Wed, 15 Jul 2020 14:30:00 -0400
parents
children edf39ed96bc3
#!/usr/bin/env python
import bz2
import ftplib
import gzip
import json
import optparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
import urllib.parse
import urllib.request
import zipfile
from binascii import hexlify

CHUNK_SIZE = 2**20

DEFAULT_DATA_TABLE_NAME = "indexed_maf_files"

# Nice solution for opening compressed files (zip/bz2/gz) transparently
# https://stackoverflow.com/a/13045892/638445


class CompressedFile(object):
    magic = None
    file_type = None
    mime_type = None
    proper_extension = None

    def __init__(self, f):
        # f is an open file or file-like object
        self.f = f
        self.accessor = self.open()

    @classmethod
    def is_magic(cls, data):
        # Compare the first bytes of the file against this type's magic number
        return hexlify(data).startswith(hexlify(cls.magic))

    def open(self):
        return None


class ZIPFile(CompressedFile):
    magic = b'\x50\x4b\x03\x04'
    file_type = 'zip'
    mime_type = 'compressed/zip'

    def open(self):
        return zipfile.ZipFile(self.f)


class BZ2File(CompressedFile):
    magic = b'\x42\x5a\x68'
    file_type = 'bz2'
    mime_type = 'compressed/bz2'

    def open(self):
        return bz2.BZ2File(self.f)


class GZFile(CompressedFile):
    magic = b'\x1f\x8b\x08'
    file_type = 'gz'
    mime_type = 'compressed/gz'

    def open(self):
        return gzip.GzipFile(self.f)


# Factory function to create a suitable instance for accessing files
def get_compressed_file(filename):
    with open(filename, 'rb') as f:
        start_of_file = f.read(16)
    # Dispatch on the magic number at the start of the file
    for cls in (ZIPFile, BZ2File, GZFile):
        if cls.is_magic(start_of_file):
            return cls(filename)
    return None
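
# Usage sketch (hypothetical filename): detection is by magic bytes rather
# than by file extension.  For gz/bz2 the accessor is a readable file-like
# object; for zip it is a ZipFile archive rather than a raw stream.
#
#     cf = get_compressed_file('chr19.maf.gz')
#     if cf is not None:
#         first_chunk = cf.accessor.read(CHUNK_SIZE)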


def url_download(url, tmp=False, localpath=None):
    """Attempt to download a file from a given URL.
    :param url: full URL to the file
    :type url: str.
    :returns: name of the downloaded (and, if needed, uncompressed) file
    :raises: IOError
    """

    # Generate file_name
    file_name = url.split('/')[-1]
    if tmp:
        file_name = os.path.join(tempfile.mkdtemp(), file_name)
    elif localpath is not None:
        file_name = os.path.join(localpath, file_name)

    try:
        # Download the URL (FTP and HTTP work, probably local and data URLs too)
        urllib.request.urlretrieve(url, file_name)

        # Uncompress the file if needed
        cf = get_compressed_file(file_name)
        if cf is not None:
            uncompressed_file_name = os.path.splitext(file_name)[0]
            with open(uncompressed_file_name, 'wb') as uncompressed_file:
                shutil.copyfileobj(cf.accessor, uncompressed_file)
            os.remove(file_name)
            file_name = uncompressed_file_name
    except IOError as e:
        sys.stderr.write('Error occurred downloading reference file: %s\n' % e)
        if os.path.exists(file_name):
            os.remove(file_name)
        raise
    return file_name
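
# Behaviour sketch (URL hypothetical): a call such as
#
#     url_download('ftp://hgdownload.soe.ucsc.edu/.../chr19.maf.gz', tmp=True)
#
# fetches chr19.maf.gz into a fresh temporary directory, transparently
# decompresses it, and returns the path of the resulting chr19.maf.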


def generate_metadata(params, options):
    name = options.name
    # Build a table-safe uid by replacing runs of non-alphanumeric characters
    # with underscores (reportedly the fastest approach, per StackOverflow).
    pattern = re.compile(r'[\W]+')
    uid = pattern.sub('_', name).strip('_')
    # Download the Newick (.nh) tree and pull the species names out of it.
    url = options.nexus
    with open(url_download(url, True), 'r') as fh:
        species = [line.strip(' (),').split(':')[0] for line in fh]
    return name, uid.upper(), species
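
# Illustration of the .nh parsing above, assuming one taxon per line as in
# the UCSC multiz Newick trees: a line such as
#
#     (hg38:0.00655,
#
# is stripped to 'hg38:0.00655' and split on ':' to yield 'hg38'.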


def get_maf_listing(maf_path):
    maf_files = []
    maf_url = urllib.parse.urlparse(maf_path)
    f = ftplib.FTP()
    f.connect(maf_url.netloc)
    f.login()
    listing = f.mlsd(maf_url.path)
    compressions = ['gz', 'bz2', 'zip']
    for name, facts in listing:
        skip = False
        # Only keep compressed regular files.
        if os.path.splitext(name)[-1].lstrip('.') not in compressions:
            skip = True
        if facts['type'] != 'file':
            skip = True
        # Exclude alternate-locus, random, and unplaced chromosome MAFs.
        for compression in compressions:
            for exclusion in ['_alt', '_random']:
                if name.endswith('%s.maf.%s' % (exclusion, compression)):
                    skip = True
                    break
        if name.startswith('chrUn'):
            skip = True
        if skip:
            continue
        maf_files.append(urllib.parse.urljoin(maf_path, name))
    f.close()
    return maf_files
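
# The MLSD listing yields (name, facts) pairs.  With the filters above an
# entry like ('chr19.maf.gz', {'type': 'file'}) would be kept, while
# ('chr19_alt.maf.gz', ...), ('chrUn.maf.gz', ...) and subdirectories are
# skipped.  (Entries here are illustrative, not taken from a real listing.)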


def index_maf_files(maf_files, maf_path, options, params, target_directory):
    for maf_file in maf_files:
        maf_url = urllib.parse.urljoin(maf_path, maf_file)
        local_maf = url_download(maf_url, localpath=target_directory)
        # Build a MAF index alongside each downloaded alignment, failing
        # loudly if the indexer exits with a nonzero status.
        index_command = ['maf_build_index.py', local_maf, local_maf + '.index']
        subprocess.check_call(index_command)
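
# Per chromosome this is equivalent to running (paths hypothetical):
#
#     maf_build_index.py /galaxy/maf/chr19.maf /galaxy/maf/chr19.maf.index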


def main():
    parser = optparse.OptionParser()
    parser.add_option('-x', '--nexus', dest='nexus', action='store', type='string', help='URL for .nh')
    parser.add_option('-a', '--alignments', dest='alignments', action='store', type='string', help='URL for alignments')
    parser.add_option('-n', '--name', dest='name', action='store', type='string', help='Name')
    parser.add_option('-o', '--output', dest='output', action='store', type='string', help='Output')
    parser.add_option('-d', '--dbkey', dest='dbkey', action='store', type='string', help='dbkey')
    (options, args) = parser.parse_args()

    # Galaxy hands the data manager its parameters as JSON in the output file.
    with open(options.output) as fh:
        params = json.load(fh)
    target_directory = params['output_data'][0]['extra_files_path']
    os.makedirs(target_directory, exist_ok=True)

    display_name, uid, species_list = generate_metadata(params, options)
    maf_path = urllib.parse.urljoin(options.nexus, 'maf/')
    maf_files = get_maf_listing(maf_path)

    data_manager_entry = {
        'data_tables': {
            DEFAULT_DATA_TABLE_NAME: {
                'name': display_name,
                'dbkey': options.dbkey,  # This is needed for the output path
                'value': uid,
                'indexed_for': ','.join(species_list),
                'exists_in_maf': ','.join(species_list),
                'path': ','.join([maf_file.split('/')[-1] for maf_file in maf_files]),
            }
        }
    }

    # Fetch and index the MAFs
    index_maf_files(maf_files, maf_path, options, params, target_directory)
    with open(options.output, 'w') as fh:
        json.dump(data_manager_entry, fh)


if __name__ == "__main__":
    main()
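
# For reference, the JSON handed back to Galaxy follows the data manager
# convention and would look roughly like this (values hypothetical):
#
#     {"data_tables": {"indexed_maf_files": {
#         "name": "30-way multiz", "dbkey": "hg38", "value": "30_WAY_MULTIZ",
#         "indexed_for": "hg38,panTro4", "exists_in_maf": "hg38,panTro4",
#         "path": "chr1.maf.gz,chr2.maf.gz"}}}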