datatypes/glycan.py @ 0:0e941a69a6fa (draft, default, tip)
Uploaded
| author | chrisb |
|---|---|
| date | Wed, 23 Mar 2016 14:34:50 -0400 |
| parents | |
| children | |

```python
__license__ = "MIT"

import logging
from galaxy.datatypes import metadata
import mimetypes
import os
import shutil
import sys
import traceback
import tempfile
import zipfile
from cgi import escape
from inspect import isclass
import galaxy.util as util
from galaxy.datatypes import data
from galaxy.datatypes.metadata import \
    MetadataElement  # import directly to maintain ease of use in Datatype class definitions
from galaxy.util import inflector
from galaxy.util.bunch import Bunch
from galaxy.util.odict import odict
from galaxy.util.sanitize_html import sanitize_html

from galaxy.datatypes import dataproviders

from galaxy import eggs

eggs.require("Paste")
import paste

log = logging.getLogger(__name__)


class kcf(data.Data):
    file_ext = 'kcf'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All KCF files put 'ENTRY' on their first line; also check for
        'Glycan' to confirm the entry is a glycan."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            kcfresponse = client.service.DeterminingForm(file(filename, 'r').read())
            if kcfresponse.array[0] == "KCF":
                return True
            else:
                return False
        except ImportError:
            # suds is unavailable, so fall back to a simple first-line check
            print "using KCF simple checker"
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            f.close()

            if "ENTRY" in firstline and "GLYCAN" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
```
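The `number_of_parts` branch of `split()` distributes `length` lines over `parts` chunks with `divmod`, handing the remainder out one extra line at a time. A minimal standalone sketch of just that distribution arithmetic (the helper name `distribute_lines` is hypothetical):

```python
def distribute_lines(length, parts):
    """Mirror the number_of_parts arithmetic in split() above."""
    parts = min(parts, length)  # never make more parts than there are lines
    len_each, remainder = divmod(length, parts)
    lines_per_file = []
    while length > 0:
        chunk = len_each
        if remainder > 0:
            chunk += 1  # the first `remainder` chunks each take one extra line
            remainder -= 1
        lines_per_file.append(chunk)
        length -= chunk
    return lines_per_file

print distribute_lines(10, 3)  # -> [4, 3, 3]
```

The chunk sizes always sum to `length`, so the per-part loop in `split()` consumes the input file exactly once.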
```python
class glycoct(data.Data):
    file_ext = 'glycoct'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All glycoct_condensed files put 'RES' on their first line and 'LIN' on a later line."""
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            lines = f.read()
            f.close()

            if "RES" in firstline and "LIN" in lines:
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
```
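Unlike the other sniffers in this file, `glycoct.sniff()` inspects the file purely locally (no SOAP call), so it is easy to exercise in isolation. A small sketch, assuming the datatype class can be instantiated directly; the condensed-GlycoCT content below is illustrative only:

```python
import tempfile

sample = "RES\n1b:b-dglc-HEX-1:5\nLIN\n"
tmp = tempfile.NamedTemporaryFile(suffix=".glycoct", delete=False)
tmp.write(sample)
tmp.close()

print glycoct().sniff(tmp.name)  # -> True: 'RES' on the first line, 'LIN' later
```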
```python
# ------------- Utility methods --------------

# nice_size used to be here, but to resolve cyclical dependencies it's been
# moved to galaxy.util.  It belongs there anyway since it's used outside
# datatypes.
nice_size = util.nice_size


def get_test_fname(fname):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join(path, 'test', fname)
    return full_path


def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[]):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'

    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open(file_name, "U")
    while count <= LINE_COUNT:
        line = temp.readline(WIDTH)
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord(char) > 128:
                        file_type = 'binary'
                        break
                data_checked = True
            if file_type in ['gzipped', 'binary']:
                break
        skip_line = False
        for skipchar in skipchars:
            if line.startswith(skipchar):
                skip_line = True
                break
        if not skip_line:
            lines.append(line)
            count += 1
    temp.close()
    if file_type in ['gzipped', 'binary']:
        text = "%s file" % file_type
    else:
        try:
            text = unicode('\n'.join(lines), 'utf-8')
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text
```
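A usage sketch for `get_file_peek()` ('input.kcf' is a placeholder path): it returns roughly the first `LINE_COUNT` lines, each `readline()` call capped at `WIDTH` characters, or a short label such as 'gzipped file' when the content looks compressed or binary:

```python
# Default peek: a handful of lines, 256 characters per readline() call
print get_file_peek('input.kcf')

# Unlimited line width, skipping comment lines
print get_file_peek('input.kcf', WIDTH='unlimited', skipchars=['#'])
```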
```python
class glycoct_xml(data.Data):
    file_ext = 'glycoct_xml'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GlycoCT XML files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GlycoCT":
                return True
            else:
                return False
        except ImportError:
            # suds is unavailable, so fall back to a simple root-element check
            print "using glycoct XML simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'sugar':
                print root.tag, root.attrib
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
```
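The `ImportError` fallback in `glycoct_xml.sniff()` only tests the XML root element name. The same check in isolation (the one-element document is illustrative only):

```python
import xml.etree.cElementTree as ET
from StringIO import StringIO

root = ET.parse(StringIO("<sugar version='1.0'/>")).getroot()
print root.tag == 'sugar'  # -> True; the root tag is all the fallback examines
```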
```python
class glydeii(data.Data):
    file_ext = 'glydeii'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GlydeII XML files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GLYDEII":
                return True
            else:
                return False
        except ImportError:
            # suds is unavailable, so fall back to a simple root-element check
            print "using GlydeII simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'GlydeII':
                print root.tag
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class linucs(data.Data):
    file_ext = 'linucs'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All LINUCS files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "LINUCS":
                return True
            else:
                return False
        except ImportError:
            # suds is unavailable, so fall back to a simple bracket check
            print "using LINUCS simple checker"

            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
```
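For large files, `estimate_file_lines()` extrapolates from a 1 MiB sample instead of scanning the whole file. Worked numbers (hypothetical):

```python
sample_size = 1048576          # bytes actually read (1 MiB)
sample_lines = 4096            # newlines seen in that sample
file_size = 8 * 1048576        # dataset.get_size() for an 8 MiB file
est_lines = int(sample_lines * (float(file_size) / float(sample_size)))
print est_lines                # -> 32768
```

`set_peek()` only takes this path for datasets larger than the sample size; smaller files are recounted exactly with `count_data_lines()`.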
```python
class iupac(data.Data):
    file_ext = 'iupac'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All IUPAC files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "IUPAC":
                return True
            else:
                return False
        except ImportError:
            # suds is unavailable, so fall back to a simple bracket check:
            # IUPAC uses [], () but, unlike LINUCS, no curly braces
            print "using IUPAC simple checker"
            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline or "]" in firstline or "(" in firstline or ")" in firstline:
                if "{" in firstline or "}" in firstline:
                    return False
                else:
                    return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
```
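Every suds-based sniffer above funnels through the same RINGS `DeterminingForm` SOAP operation and compares `array[0]` against a format name. A direct-call sketch (requires the suds package and a reachable service; 'input.txt' is a placeholder):

```python
from suds.client import Client

# WSDL endpoint taken from the sniffers above; the service may be unreachable
client = Client('http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl')
form = client.service.DeterminingForm(open('input.txt').read())
print form.array[0]  # e.g. "KCF", "GlycoCT", "GLYDEII", "LINUCS", "IUPAC" or "LinearCode"
```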
| 1361 | |
| 1362 class linearcode(data.Data): | |
| 1363 file_ext = 'linearcode' | |
| 1364 line_class = 'line' | |
| 1365 | |
| 1366 """Add metadata elements""" | |
| 1367 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, | |
| 1368 visible=False, no_value=0) | |
| 1369 | |
| 1370 def write_from_stream(self, dataset, stream): | |
| 1371 """Writes data from a stream""" | |
| 1372 # write it twice for now | |
| 1373 fd, temp_name = tempfile.mkstemp() | |
| 1374 while 1: | |
| 1375 chunk = stream.read(1048576) | |
| 1376 if not chunk: | |
| 1377 break | |
| 1378 os.write(fd, chunk) | |
| 1379 os.close(fd) | |
| 1380 # rewrite the file with unix newlines | |
| 1381 fp = open(dataset.file_name, 'wt') | |
| 1382 for line in file(temp_name, "U"): | |
| 1383 line = line.strip() + '\n' | |
| 1384 fp.write(line) | |
| 1385 fp.close() | |
| 1386 | |
| 1387 def set_raw_data(self, dataset, data): | |
| 1388 """Saves the data on the disc""" | |
| 1389 fd, temp_name = tempfile.mkstemp() | |
| 1390 os.write(fd, data) | |
| 1391 os.close(fd) | |
| 1392 # rewrite the file with unix newlines | |
| 1393 fp = open(dataset.file_name, 'wt') | |
| 1394 for line in file(temp_name, "U"): | |
| 1395 line = line.strip() + '\n' | |
| 1396 fp.write(line) | |
| 1397 fp.close() | |
| 1398 os.remove(temp_name) | |
| 1399 | |
| 1400 def get_mime(self): | |
| 1401 """Returns the mime type of the datatype""" | |
| 1402 return 'text/plain' | |
| 1403 | |
| 1404 def set_meta(self, dataset, **kwd): | |
| 1405 """ | |
| 1406 Set the number of lines of data in dataset. | |
| 1407 """ | |
| 1408 dataset.metadata.data_lines = self.count_data_lines(dataset) | |
| 1409 | |
| 1410 def estimate_file_lines(self, dataset): | |
| 1411 """ | |
| 1412 Perform a rough estimate by extrapolating number of lines from a small read. | |
| 1413 """ | |
| 1414 sample_size = 1048576 | |
| 1415 dataset_fh = open(dataset.file_name) | |
| 1416 dataset_read = dataset_fh.read(sample_size) | |
| 1417 dataset_fh.close() | |
| 1418 sample_lines = dataset_read.count('\n') | |
| 1419 est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) | |
| 1420 return est_lines | |
| 1421 | |
| 1422 def count_data_lines(self, dataset): | |
| 1423 """ | |
| 1424 Count the number of lines of data in dataset, | |
| 1425 skipping all blank lines and comments. | |
| 1426 """ | |
| 1427 data_lines = 0 | |
| 1428 for line in file(dataset.file_name): | |
| 1429 line = line.strip() | |
| 1430 if line and not line.startswith('#'): | |
| 1431 data_lines += 1 | |
| 1432 return data_lines | |
| 1433 | |
| 1434 def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): | |
| 1435 """ | |
| 1436 Set the peek. This method is used by various subclasses of Text. | |
| 1437 """ | |
| 1438 if not dataset.dataset.purged: | |
| 1439 # The file must exist on disk for the get_file_peek() method | |
| 1440 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, | |
| 1441 skipchars=skipchars) | |
| 1442 if line_count is None: | |
| 1443 # See if line_count is stored in the metadata | |
| 1444 if dataset.metadata.data_lines: | |
| 1445 dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), | |
| 1446 inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) | |
| 1447 else: | |
| 1448 # Number of lines is not known ( this should not happen ), and auto-detect is | |
| 1449 # needed to set metadata | |
| 1450 # This can happen when the file is larger than max_optional_metadata_filesize. | |
| 1451 if int(dataset.get_size()) <= 1048576: | |
| 1452 # Small dataset, recount all lines and reset peek afterward. | |
| 1453 lc = self.count_data_lines(dataset) | |
| 1454 dataset.metadata.data_lines = lc | |
| 1455 dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) | |
| 1456 else: | |
| 1457 est_lines = self.estimate_file_lines(dataset) | |
| 1458 dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), | |
| 1459 inflector.cond_plural(est_lines, self.line_class) ) | |
| 1460 else: | |
| 1461 dataset.blurb = "%s %s" % ( | |
| 1462 util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) | |
| 1463 else: | |
| 1464 dataset.peek = 'file does not exist' | |
| 1465 dataset.blurb = 'file purged from disk' | |
| 1466 | |
| 1467 def sniff(self, filename): | |
| 1468 """All linear code files should use the rings form determination script """ | |
| 1469 try: | |
| 1470 from suds.client import Client | |
| 1471 | |
| 1472 url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' | |
| 1473 client = Client(url) | |
| 1474 lcresponse = client.service.DeterminingForm(file(filename, 'r').read()) | |
| 1475 if lcresponse.array[0] == "LinearCode": | |
| 1476 print "LinearCode" | |
| 1477 return True | |
| 1478 else: | |
| 1479 print "Unable to guess format" | |
| 1480 return False | |
| 1481 except ImportError: | |
| 1482 # suds is not installed; a simple local checker does not exist yet (see the sketch below) | |
| 1483 print "suds unavailable; no simple LinearCode checker implemented yet" | |
| 1484 return False | |
| 1485 except Exception: | |
| 1486 # Deliberately not raising; return False so another sniffer can try to type this data | |
| 1487 traceback.print_exc(file=sys.stdout) | |
| 1488 return False | |
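| | |
|     def _simple_linearcode_check(self, filename): | |
|         """Hypothetical local fallback for sniff() when suds is unavailable -- | |
|         a minimal sketch, not part of the original file. It only screens the | |
|         first line for characters plausible in LinearCode (residue letters, | |
|         linkage digits, and branch punctuation); a real validator would need | |
|         the published LinearCode grammar. | |
|         """ | |
|         import re | |
|         try: | |
|             first_line = open(filename, 'r').readline().strip() | |
|         except IOError: | |
|             return False | |
|         return bool(first_line) and \ | |
|             re.match(r'^[A-Za-z0-9()\[\];,?=-]+$', first_line) is not None | |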
| 1489 | |
| 1490 def split(cls, input_datasets, subdir_generator_function, split_params): | |
| 1491 """ | |
| 1492 Split the input files by line. | |
| 1493 """ | |
| 1494 if split_params is None: | |
| 1495 return | |
| 1496 | |
| 1497 if len(input_datasets) > 1: | |
| 1498 raise Exception("Text file splitting does not support multiple files") | |
| 1499 input_files = [ds.file_name for ds in input_datasets] | |
| 1500 | |
| 1501 lines_per_file = None | |
| 1502 chunk_size = None | |
| 1503 if split_params['split_mode'] == 'number_of_parts': | |
| 1504 lines_per_file = [] | |
| 1505 # Computing the length is expensive! | |
| 1506 def _file_len(fname): | |
| 1507 i = 0 | |
| 1508 f = open(fname) | |
| 1509 for i, l in enumerate(f): | |
| 1510 pass | |
| 1511 f.close() | |
| 1512 return i + 1 | |
| 1513 | |
| 1514 length = _file_len(input_files[0]) | |
| 1515 parts = int(split_params['split_size']) | |
| 1516 if length < parts: | |
| 1517 parts = length | |
| 1518 len_each, remainder = divmod(length, parts) | |
| 1519 while length > 0: | |
| 1520 chunk = len_each | |
| 1521 if remainder > 0: | |
| 1522 chunk += 1 | |
| 1523 lines_per_file.append(chunk) | |
| 1524 remainder -= 1 | |
| 1525 length -= chunk | |
| 1526 elif split_params['split_mode'] == 'to_size': | |
| 1527 chunk_size = int(split_params['split_size']) | |
| 1528 else: | |
| 1529 raise Exception('Unsupported split mode %s' % split_params['split_mode']) | |
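|         # Worked example of the number_of_parts branch (a sketch): splitting a | |
|         # 10-line file into 3 parts gives divmod(10, 3) == (3, 1), so the loop | |
|         # above appends chunks of 4, 3 and 3 lines -- the single remainder line | |
|         # pads the first part. | |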
| 1530 | |
| 1531 f = open(input_files[0], 'rt') | |
| 1532 try: | |
| 1533 chunk_idx = 0 | |
| 1534 file_done = False | |
| 1535 part_file = None | |
| 1536 while not file_done: | |
| 1537 if lines_per_file is None: | |
| 1538 this_chunk_size = chunk_size | |
| 1539 elif chunk_idx < len(lines_per_file): | |
| 1540 this_chunk_size = lines_per_file[chunk_idx] | |
| 1541 chunk_idx += 1 | |
| 1542 lines_remaining = this_chunk_size | |
| 1543 part_file = None | |
| 1544 while lines_remaining > 0: | |
| 1545 a_line = f.readline() | |
| 1546 if a_line == '': | |
| 1547 file_done = True | |
| 1548 break | |
| 1549 if part_file is None: | |
| 1550 part_dir = subdir_generator_function() | |
| 1551 part_path = os.path.join(part_dir, os.path.basename(input_files[0])) | |
| 1552 part_file = open(part_path, 'w') | |
| 1553 part_file.write(a_line) | |
| 1554 lines_remaining -= 1 | |
| 1555 if part_file is not None: | |
| 1556 part_file.close() | |
| 1557 except Exception, e: | |
| 1558 log.error('Unable to split files: %s' % str(e)) | |
| 1559 f.close() | |
| 1560 if part_file is not None: | |
| 1561 part_file.close() | |
| 1562 raise | |
| 1563 f.close() | |
| 1564 | |
| 1565 split = classmethod(split) | |
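|     # Usage sketch (hypothetical values): Galaxy invokes split() with a dict | |
|     # such as {'split_mode': 'number_of_parts', 'split_size': 4} for four | |
|     # parts, or {'split_mode': 'to_size', 'split_size': 1000} for chunks of | |
|     # at most 1000 lines; subdir_generator_function must return a fresh | |
|     # directory for each part file. | |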
| 1566 | |
| 1567 | |
| 1568 class msa(data.Data): | |
| 1569 file_ext = 'msa' | |
| 1570 line_class = 'line' | |
| 1571 | |
| 1572 """Add metadata elements""" | |
| 1573 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, | |
| 1574 visible=False, no_value=0) | |
| 1575 | |
| 1576 def write_from_stream(self, dataset, stream): | |
| 1577 """Writes data from a stream""" | |
| 1578 # write it twice for now | |
| 1579 fd, temp_name = tempfile.mkstemp() | |
| 1580 while 1: | |
| 1581 chunk = stream.read(1048576) | |
| 1582 if not chunk: | |
| 1583 break | |
| 1584 os.write(fd, chunk) | |
| 1585 os.close(fd) | |
| 1586 # rewrite the file with unix newlines | |
| 1587 fp = open(dataset.file_name, 'wt') | |
| 1588 for line in file(temp_name, "U"): | |
| 1589 line = line.strip() + '\n' | |
| 1590 fp.write(line) | |
| 1591 fp.close() | |
| 1592 | |
| 1593 def set_raw_data(self, dataset, data): | |
| 1594 """Saves the data on the disc""" | |
| 1595 fd, temp_name = tempfile.mkstemp() | |
| 1596 os.write(fd, data) | |
| 1597 os.close(fd) | |
| 1598 # rewrite the file with unix newlines | |
| 1599 fp = open(dataset.file_name, 'wt') | |
| 1600 for line in file(temp_name, "U"): | |
| 1601 line = line.strip() + '\n' | |
| 1602 fp.write(line) | |
| 1603 fp.close() | |
| 1604 os.remove(temp_name) | |
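|     # Note on the rewrite above: mode "U" (universal newlines, a Python 2 | |
|     # idiom) accepts any newline convention, and each line is written back | |
|     # terminated with '\n'. Beware that strip() also removes leading | |
|     # whitespace, so indentation-significant content would be altered. | |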
| 1605 | |
| 1606 def get_mime(self): | |
| 1607 """Returns the mime type of the datatype""" | |
| 1608 return 'text/plain' | |
| 1609 | |
| 1610 def set_meta(self, dataset, **kwd): | |
| 1611 """ | |
| 1612 Set the number of lines of data in dataset. | |
| 1613 """ | |
| 1614 dataset.metadata.data_lines = self.count_data_lines(dataset) | |
| 1615 | |
| 1616 def estimate_file_lines(self, dataset): | |
| 1617 """ | |
| 1618 Perform a rough estimate by extrapolating number of lines from a small read. | |
| 1619 """ | |
| 1620 sample_size = 1048576 | |
| 1621 dataset_fh = open(dataset.file_name) | |
| 1622 dataset_read = dataset_fh.read(sample_size) | |
| 1623 dataset_fh.close() | |
| 1624 sample_lines = dataset_read.count('\n') | |
| 1625 est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) | |
| 1626 return est_lines | |
| 1627 | |
| 1628 def count_data_lines(self, dataset): | |
| 1629 """ | |
| 1630 Count the number of lines of data in dataset, | |
| 1631 skipping all blank lines and comments. | |
| 1632 """ | |
| 1633 data_lines = 0 | |
| 1634 for line in file(dataset.file_name): | |
| 1635 line = line.strip() | |
| 1636 if line and not line.startswith('#'): | |
| 1637 data_lines += 1 | |
| 1638 return data_lines | |
| 1639 | |
| 1640 def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): | |
| 1641 """ | |
| 1642 Set the peek. This method is used by various subclasses of Text. | |
| 1643 """ | |
| 1644 if not dataset.dataset.purged: | |
| 1645 # The file must exist on disk for the get_file_peek() method | |
| 1646 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, | |
| 1647 skipchars=skipchars) | |
| 1648 if line_count is None: | |
| 1649 # See if line_count is stored in the metadata | |
| 1650 if dataset.metadata.data_lines: | |
| 1651 dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), | |
| 1652 inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) | |
| 1653 else: | |
| 1654 # Number of lines is not known; this can happen when the file is | |
| 1655 # larger than max_optional_metadata_filesize, in which case metadata | |
| 1656 # auto-detection was skipped and the count is computed or estimated here. | |
| 1657 if int(dataset.get_size()) <= 1048576: | |
| 1658 # Small dataset, recount all lines and reset peek afterward. | |
| 1659 lc = self.count_data_lines(dataset) | |
| 1660 dataset.metadata.data_lines = lc | |
| 1661 dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) | |
| 1662 else: | |
| 1663 est_lines = self.estimate_file_lines(dataset) | |
| 1664 dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), | |
| 1665 inflector.cond_plural(est_lines, self.line_class) ) | |
| 1666 else: | |
| 1667 dataset.blurb = "%s %s" % ( | |
| 1668 util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) | |
| 1669 else: | |
| 1670 dataset.peek = 'file does not exist' | |
| 1671 dataset.blurb = 'file purged from disk' | |
| 1672 | |
| 1673 def sniff(self, filename): | |
| 1674 """All msa Files simply put a '# .msa' in the first line. """ | |
| 1675 try: | |
| 1676 f = open(filename, "r") | |
| 1677 firstline = f.readline().upper()  # uppercase so the comparison is case-insensitive | |
| 1678 f.close() | |
| 1679 | |
| 1680 if "# .MSA" in firstline: | |
| 1681 return True | |
| 1682 else: | |
| 1683 return False | |
| 1684 except: | |
| 1685 traceback.print_exc(file=sys.stdout) | |
| 1686 return False | |
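|     # Usage sketch (hypothetical file, not from the source): a dataset whose | |
|     # first line is "# .msa generated by an aligner" sniffs as msa, because | |
|     # the uppercased line contains "# .MSA"; any other first line does not. | |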
| 1687 | |
| 1688 def split(cls, input_datasets, subdir_generator_function, split_params): | |
| 1689 """ | |
| 1690 Split the input files by line. | |
| 1691 """ | |
| 1692 if split_params is None: | |
| 1693 return | |
| 1694 | |
| 1695 if len(input_datasets) > 1: | |
| 1696 raise Exception("Text file splitting does not support multiple files") | |
| 1697 input_files = [ds.file_name for ds in input_datasets] | |
| 1698 | |
| 1699 lines_per_file = None | |
| 1700 chunk_size = None | |
| 1701 if split_params['split_mode'] == 'number_of_parts': | |
| 1702 lines_per_file = [] | |
| 1703 # Computing the length is expensive! | |
| 1704 def _file_len(fname): | |
| 1705 i = 0 | |
| 1706 f = open(fname) | |
| 1707 for i, l in enumerate(f): | |
| 1708 pass | |
| 1709 f.close() | |
| 1710 return i + 1 | |
| 1711 | |
| 1712 length = _file_len(input_files[0]) | |
| 1713 parts = int(split_params['split_size']) | |
| 1714 if length < parts: | |
| 1715 parts = length | |
| 1716 len_each, remainder = divmod(length, parts) | |
| 1717 while length > 0: | |
| 1718 chunk = len_each | |
| 1719 if remainder > 0: | |
| 1720 chunk += 1 | |
| 1721 lines_per_file.append(chunk) | |
| 1722 remainder -= 1 | |
| 1723 length -= chunk | |
| 1724 elif split_params['split_mode'] == 'to_size': | |
| 1725 chunk_size = int(split_params['split_size']) | |
| 1726 else: | |
| 1727 raise Exception('Unsupported split mode %s' % split_params['split_mode']) | |
| 1728 | |
| 1729 f = open(input_files[0], 'rt') | |
| 1730 try: | |
| 1731 chunk_idx = 0 | |
| 1732 file_done = False | |
| 1733 part_file = None | |
| 1734 while not file_done: | |
| 1735 if lines_per_file is None: | |
| 1736 this_chunk_size = chunk_size | |
| 1737 elif chunk_idx < len(lines_per_file): | |
| 1738 this_chunk_size = lines_per_file[chunk_idx] | |
| 1739 chunk_idx += 1 | |
| 1740 lines_remaining = this_chunk_size | |
| 1741 part_file = None | |
| 1742 while lines_remaining > 0: | |
| 1743 a_line = f.readline() | |
| 1744 if a_line == '': | |
| 1745 file_done = True | |
| 1746 break | |
| 1747 if part_file is None: | |
| 1748 part_dir = subdir_generator_function() | |
| 1749 part_path = os.path.join(part_dir, os.path.basename(input_files[0])) | |
| 1750 part_file = open(part_path, 'w') | |
| 1751 part_file.write(a_line) | |
| 1752 lines_remaining -= 1 | |
| 1753 if part_file is not None: | |
| 1754 part_file.close() | |
| 1755 except Exception, e: | |
| 1756 log.error('Unable to split files: %s' % str(e)) | |
| 1757 f.close() | |
| 1758 if part_file is not None: | |
| 1759 part_file.close() | |
| 1760 raise | |
| 1761 f.close() | |
| 1762 | |
| 1763 split = classmethod(split) | |
| 1764 | |
| 1765 | |
| 1766 class wurcs(data.Data): | |
| 1767 file_ext = 'wurcs' | |
| 1768 line_class = 'line' | |
| 1769 | |
| 1770 """Add metadata elements""" | |
| 1771 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, | |
| 1772 visible=False, no_value=0) | |
| 1773 | |
| 1774 def write_from_stream(self, dataset, stream): | |
| 1775 """Writes data from a stream""" | |
| 1776 # write it twice for now | |
| 1777 fd, temp_name = tempfile.mkstemp() | |
| 1778 while 1: | |
| 1779 chunk = stream.read(1048576) | |
| 1780 if not chunk: | |
| 1781 break | |
| 1782 os.write(fd, chunk) | |
| 1783 os.close(fd) | |
| 1784 # rewrite the file with unix newlines | |
| 1785 fp = open(dataset.file_name, 'wt') | |
| 1786 for line in file(temp_name, "U"): | |
| 1787 line = line.strip() + '\n' | |
| 1788 fp.write(line) | |
| 1789 fp.close() | |
| 1790 | |
| 1791 def set_raw_data(self, dataset, data): | |
| 1792 """Saves the data on the disc""" | |
| 1793 fd, temp_name = tempfile.mkstemp() | |
| 1794 os.write(fd, data) | |
| 1795 os.close(fd) | |
| 1796 # rewrite the file with unix newlines | |
| 1797 fp = open(dataset.file_name, 'wt') | |
| 1798 for line in file(temp_name, "U"): | |
| 1799 line = line.strip() + '\n' | |
| 1800 fp.write(line) | |
| 1801 fp.close() | |
| 1802 os.remove(temp_name) | |
| 1803 | |
| 1804 def get_mime(self): | |
| 1805 """Returns the mime type of the datatype""" | |
| 1806 return 'text/plain' | |
| 1807 | |
| 1808 def set_meta(self, dataset, **kwd): | |
| 1809 """ | |
| 1810 Set the number of lines of data in dataset. | |
| 1811 """ | |
| 1812 dataset.metadata.data_lines = self.count_data_lines(dataset) | |
| 1813 | |
| 1814 def estimate_file_lines(self, dataset): | |
| 1815 """ | |
| 1816 Perform a rough estimate by extrapolating number of lines from a small read. | |
| 1817 """ | |
| 1818 sample_size = 1048576 | |
| 1819 dataset_fh = open(dataset.file_name) | |
| 1820 dataset_read = dataset_fh.read(sample_size) | |
| 1821 dataset_fh.close() | |
| 1822 sample_lines = dataset_read.count('\n') | |
| 1823 est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) | |
| 1824 return est_lines | |
| 1825 | |
| 1826 def count_data_lines(self, dataset): | |
| 1827 """ | |
| 1828 Count the number of lines of data in dataset, | |
| 1829 skipping all blank lines and comments. | |
| 1830 """ | |
| 1831 data_lines = 0 | |
| 1832 for line in file(dataset.file_name): | |
| 1833 line = line.strip() | |
| 1834 if line and not line.startswith('#'): | |
| 1835 data_lines += 1 | |
| 1836 return data_lines | |
| 1837 | |
| 1838 def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): | |
| 1839 """ | |
| 1840 Set the peek. This method is used by various subclasses of Text. | |
| 1841 """ | |
| 1842 if not dataset.dataset.purged: | |
| 1843 # The file must exist on disk for the get_file_peek() method | |
| 1844 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, | |
| 1845 skipchars=skipchars) | |
| 1846 if line_count is None: | |
| 1847 # See if line_count is stored in the metadata | |
| 1848 if dataset.metadata.data_lines: | |
| 1849 dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), | |
| 1850 inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) | |
| 1851 else: | |
| 1852 # Number of lines is not known; this can happen when the file is | |
| 1853 # larger than max_optional_metadata_filesize, in which case metadata | |
| 1854 # auto-detection was skipped and the count is computed or estimated here. | |
| 1855 if int(dataset.get_size()) <= 1048576: | |
| 1856 # Small dataset, recount all lines and reset peek afterward. | |
| 1857 lc = self.count_data_lines(dataset) | |
| 1858 dataset.metadata.data_lines = lc | |
| 1859 dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) | |
| 1860 else: | |
| 1861 est_lines = self.estimate_file_lines(dataset) | |
| 1862 dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), | |
| 1863 inflector.cond_plural(est_lines, self.line_class) ) | |
| 1864 else: | |
| 1865 dataset.blurb = "%s %s" % ( | |
| 1866 util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) | |
| 1867 else: | |
| 1868 dataset.peek = 'file does not exist' | |
| 1869 dataset.blurb = 'file purged from disk' | |
| 1870 | |
| 1871 def sniff(self, filename): | |
| 1872 """All WURCS Files start with WURCS= then the version number. see http://www.wurcs-wg.org/definition.php and http://rings.t.soka.ac.jp/ | |
| 1873 WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1""" | |
| 1874 try: | |
| 1875 f = open(filename, "r") | |
| 1876 firstline = f.readline().upper()  # uppercase so the comparison is case-insensitive | |
| 1877 f.close() | |
| 1878 if "WURCS" in firstline: | |
| 1879 return True | |
| 1880 else: | |
| 1881 return False | |
| 1882 except: | |
| 1883 traceback.print_exc(file=sys.stdout) | |
| 1884 return False | |
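|     # Example: the docstring's sample begins "WURCS=2.0/4,3/...", so its | |
|     # uppercased first line starts with "WURCS" and sniff() returns True. | |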
| 1885 | |
| 1887 def split(cls, input_datasets, subdir_generator_function, split_params): | |
| 1888 """ | |
| 1889 Split the input files by line. | |
| 1890 """ | |
| 1891 if split_params is None: | |
| 1892 return | |
| 1893 | |
| 1894 if len(input_datasets) > 1: | |
| 1895 raise Exception("Text file splitting does not support multiple files") | |
| 1896 input_files = [ds.file_name for ds in input_datasets] | |
| 1897 | |
| 1898 lines_per_file = None | |
| 1899 chunk_size = None | |
| 1900 if split_params['split_mode'] == 'number_of_parts': | |
| 1901 lines_per_file = [] | |
| 1902 # Computing the length is expensive! | |
| 1903 def _file_len(fname): | |
| 1904 i = 0 | |
| 1905 f = open(fname) | |
| 1906 for i, l in enumerate(f): | |
| 1907 pass | |
| 1908 f.close() | |
| 1909 return i + 1 | |
| 1910 | |
| 1911 length = _file_len(input_files[0]) | |
| 1912 parts = int(split_params['split_size']) | |
| 1913 if length < parts: | |
| 1914 parts = length | |
| 1915 len_each, remainder = divmod(length, parts) | |
| 1916 while length > 0: | |
| 1917 chunk = len_each | |
| 1918 if remainder > 0: | |
| 1919 chunk += 1 | |
| 1920 lines_per_file.append(chunk) | |
| 1921 remainder -= 1 | |
| 1922 length -= chunk | |
| 1923 elif split_params['split_mode'] == 'to_size': | |
| 1924 chunk_size = int(split_params['split_size']) | |
| 1925 else: | |
| 1926 raise Exception('Unsupported split mode %s' % split_params['split_mode']) | |
| 1927 | |
| 1928 f = open(input_files[0], 'rt') | |
| 1929 try: | |
| 1930 chunk_idx = 0 | |
| 1931 file_done = False | |
| 1932 part_file = None | |
| 1933 while not file_done: | |
| 1934 if lines_per_file is None: | |
| 1935 this_chunk_size = chunk_size | |
| 1936 elif chunk_idx < len(lines_per_file): | |
| 1937 this_chunk_size = lines_per_file[chunk_idx] | |
| 1938 chunk_idx += 1 | |
| 1939 lines_remaining = this_chunk_size | |
| 1940 part_file = None | |
| 1941 while lines_remaining > 0: | |
| 1942 a_line = f.readline() | |
| 1943 if a_line == '': | |
| 1944 file_done = True | |
| 1945 break | |
| 1946 if part_file is None: | |
| 1947 part_dir = subdir_generator_function() | |
| 1948 part_path = os.path.join(part_dir, os.path.basename(input_files[0])) | |
| 1949 part_file = open(part_path, 'w') | |
| 1950 part_file.write(a_line) | |
| 1951 lines_remaining -= 1 | |
| 1952 if part_file is not None: | |
| 1953 part_file.close() | |
| 1954 except Exception, e: | |
| 1955 log.error('Unable to split files: %s' % str(e)) | |
| 1956 f.close() | |
| 1957 if part_file is not None: | |
| 1958 part_file.close() | |
| 1959 raise | |
| 1960 f.close() | |
| 1961 | |
| 1962 split = classmethod(split) | |
| 1963 | |
| 1964 | |
