Mercurial > repos > chrisb > gap_datatypes

datatypes/glycan.py @ 0:0e941a69a6fa (draft, default, tip)

commit message: Uploaded
author:         chrisb
date:           Wed, 23 Mar 2016 14:34:50 -0400
__license__ = "MIT"

import logging
from galaxy.datatypes import metadata
import mimetypes
import os
import shutil
import sys
import traceback
import tempfile
import zipfile
from cgi import escape
from inspect import isclass
import galaxy.util as util
from galaxy.datatypes import data
from galaxy.datatypes.metadata import \
    MetadataElement  # import directly to maintain ease of use in Datatype class definitions
from galaxy.util import inflector
from galaxy.util.bunch import Bunch
from galaxy.util.odict import odict
from galaxy.util.sanitize_html import sanitize_html

from galaxy.datatypes import dataproviders

from galaxy import eggs

eggs.require("Paste")
import paste

# Module-level logger; the split() methods below log through this. (The file as
# uploaded used `log` without defining it, which would raise a NameError on the
# error path.)
log = logging.getLogger(__name__)

class kcf(data.Data):
    file_ext = 'kcf'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All KCF files put 'ENTRY' in the first line; this applies to every
        possible KCF. Additionally check for 'Glycan' to confirm the entry is a glycan."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            kcfresponse = client.service.DeterminingForm(file(filename, 'r').read())
            if kcfresponse.array[0] == "KCF":
                return True
            else:
                return False
        except ImportError:
            # suds is not available, so fall back to a simple checker
            print "using KCF simple checker"
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            f.close()

            if "ENTRY" in firstline and "GLYCAN" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


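# Usage sketch (illustrative, not part of the uploaded file): when suds is not
# installed, kcf.sniff() falls back to the first-line check, so a KEGG-style
# KCF entry whose header line carries both tokens would be accepted, e.g.
#
#     ENTRY     G00047    Glycan
#     NODE      2
#     ...
#
#     >>> kcf().sniff('/path/to/entry.kcf')   # hypothetical path
#     True
#
# split() expects a dict such as {'split_mode': 'number_of_parts', 'split_size': 4}
# or {'split_mode': 'to_size', 'split_size': 1000}; any other mode raises.

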
class glycoct(data.Data):
    file_ext = 'glycoct'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All glycoct_condensed files put 'RES' in the first line and a 'LIN' section later."""
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            lines = f.read()
            f.close()

            if "RES" in firstline and "LIN" in lines:
                return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


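# Illustrative GlycoCT{condensed} fragment that this sniffer would accept (an
# assumed example drawn from the format's RES/LIN section layout): the first
# line is 'RES' and a 'LIN' section follows later in the file.
#
#     RES
#     1b:b-dglc-HEX-1:5
#     2b:b-dgal-HEX-1:5
#     LIN
#     1:1o(4+1)2d

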
# ------------- Utility methods --------------

# nice_size used to be here, but to resolve cyclical dependencies it's been
# moved to galaxy.util. It belongs there anyway since it's used outside
# datatypes.
nice_size = util.nice_size


def get_test_fname(fname):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join(path, 'test', fname)
    return full_path


def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[]):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH.

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'
    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF. Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open(file_name, "U")
    while count <= LINE_COUNT:
        line = temp.readline(WIDTH)
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord(char) > 128:
                        file_type = 'binary'
                        break
            data_checked = True
        if file_type in ['gzipped', 'binary']:
            break
        skip_line = False
        for skipchar in skipchars:
            if line.startswith(skipchar):
                skip_line = True
                break
        if not skip_line:
            lines.append(line)
            count += 1
    temp.close()
    if file_type in ['gzipped', 'binary']:
        text = "%s file" % file_type
    else:
        try:
            text = unicode('\n'.join(lines), 'utf-8')
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text


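# Minimal sketch of how the helper above is typically used (the path is
# hypothetical): peek at the first five lines of a dataset, wrapped to 256
# characters, skipping comment lines.
#
#     peek = get_file_peek('/path/to/dataset.dat', skipchars=['#'])
#     print peek
#
# Compressed and binary inputs come back as the strings 'gzipped file' /
# 'binary file' rather than raw bytes.

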
class glycoct_xml(data.Data):
    file_ext = 'glycoct_xml'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GlycoCT XML files should be identified with the RINGS form determination service."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GlycoCT":
                return True
            else:
                return False
        except ImportError:
            # suds is not available, so fall back to a simple checker
            print "using glycoct XML simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'sugar':
                print root.tag, root.attrib
                return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


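# Illustrative GlycoCT XML skeleton for the ElementTree fallback above (an
# assumed example): the document root must be <sugar> for sniff() to return True.
#
#     <sugar version="1.0">
#       <residues>...</residues>
#       <linkages>...</linkages>
#     </sugar>

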
class glydeii(data.Data):
    file_ext = 'glydeii'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GLYDE-II XML files should be identified with the RINGS form determination service."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GLYDEII":
                return True
            else:
                return False
        except ImportError:
            # suds is not available, so fall back to a simple checker
            print "using GlydeII simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'GlydeII':
                print root.tag
                return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


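# Illustrative GLYDE-II skeleton for the fallback check above (an assumed
# example): the root element must literally be <GlydeII>.
#
#     <GlydeII>
#       <molecule subtype="glycan" id="...">...</molecule>
#     </GlydeII>

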
class linucs(data.Data):
    file_ext = 'linucs'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All LINUCS files should be identified with the RINGS form determination service."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "LINUCS":
                return True
            else:
                return False
        except ImportError:
            # suds is not available, so fall back to a simple checker
            print "using LINUCS simple checker"

            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


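# Illustrative LINUCS string (an assumed example): the fallback accepts a first
# line containing all four of '[', ']', '{' and '}', e.g.
#
#     [][b-D-GlcpNAc]{[(4+1)][b-D-GlcpNAc]{}}

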
class iupac(data.Data):
    file_ext = 'iupac'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All IUPAC files should be identified with the RINGS form determination service."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "IUPAC":
                return True
            else:
                return False
        except ImportError:
            # suds is not available, so fall back to a simple checker
            print "using IUPAC simple checker"
            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline or "]" in firstline or "(" in firstline or ")" in firstline:
                if "{" in firstline or "}" in firstline:
                    return False
                else:
                    return True
            else:
                return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


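# Illustrative IUPAC-style string (an assumed example): the fallback wants
# brackets or parentheses but no curly braces, so a line such as
#
#     Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc
#
# passes, while any line containing '{' or '}' is rejected as LINUCS-like.

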
class linearcode(data.Data):
    file_ext = 'linearcode'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata.
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All LinearCode files should be identified with the RINGS form determination service."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            lcresponse = client.service.DeterminingForm(file(filename, 'r').read())
            if lcresponse.array[0] == "LinearCode":
                print "LinearCode"
                return True
            else:
                print "Unable to guess format"
                return False
        except ImportError:
            # suds is not available, and no simple checker exists yet for LinearCode
            print "using LinearCode simple checker - nope it does not exist yet"
            return False
        except Exception, e:
            # do not raise an error; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


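# Note: unlike the other sniffers, linearcode has no offline fallback; without
# suds (and the RINGS service) sniff() always returns False. A LinearCode
# string such as 'GNb4GN' (an assumed example) is therefore only detected when
# the web service is reachable.

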
1568 class msa(data.Data): | |
1569 file_ext = 'msa' | |
1570 line_class = 'line' | |
1571 | |
1572 """Add metadata elements""" | |
1573 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, | |
1574 visible=False, no_value=0) | |
1575 | |
1576 def write_from_stream(self, dataset, stream): | |
1577 """Writes data from a stream""" | |
1578 # write it twice for now | |
1579 fd, temp_name = tempfile.mkstemp() | |
1580 while 1: | |
1581 chunk = stream.read(1048576) | |
1582 if not chunk: | |
1583 break | |
1584 os.write(fd, chunk) | |
1585 os.close(fd) | |
1586 # rewrite the file with unix newlines | |
1587 fp = open(dataset.file_name, 'wt') | |
1588 for line in file(temp_name, "U"): | |
1589 line = line.strip() + '\n' | |
1590 fp.write(line) | |
1591 fp.close() | |
1592 | |
1593 def set_raw_data(self, dataset, data): | |
1594 """Saves the data on the disc""" | |
1595 fd, temp_name = tempfile.mkstemp() | |
1596 os.write(fd, data) | |
1597 os.close(fd) | |
1598 # rewrite the file with unix newlines | |
1599 fp = open(dataset.file_name, 'wt') | |
1600 for line in file(temp_name, "U"): | |
1601 line = line.strip() + '\n' | |
1602 fp.write(line) | |
1603 fp.close() | |
1604 os.remove(temp_name) | |
1605 | |
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
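        # Worked example (illustrative): if the first 1 MiB of a 4 MiB file
        # contains 5000 newlines, est_lines == int(5000 * (4194304 / 1048576.0)),
        # i.e. 20000.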
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The line count is not stored in the metadata; this can happen
                    # when the file is larger than max_optional_metadata_filesize,
                    # so it must be detected here.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)),
                                                   inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

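    # Illustrative blurb values from set_peek() above (exact rounding depends on
    # util.roundify): metadata.data_lines == 1234 yields "1,234 lines", while an
    # estimated count on a large file yields something like "~57,000 lines".
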
    def sniff(self, filename):
        """All msa files put '# .msa' in the first line."""
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to make the check case-insensitive
            f.close()

            if "# .MSA" in firstline:
                return True
            else:
                return False
        except Exception:
            traceback.print_exc(file=sys.stdout)
            return False

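    # Minimal usage sketch for the sniffer above (hypothetical path; assumes the
    # file's first line contains '# .msa'):
    #   msa().sniff('/tmp/example.msa')  ->  True
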
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class wurcs(data.Data):
    file_ext = 'wurcs'
    line_class = 'line'

    """Add metadata elements"""
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()

    def set_raw_data(self, dataset, data):
        """Saves the data on the disk"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

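    # Example for count_data_lines() above (illustrative): a file whose lines are
    # ['# header', '', 'WURCS=2.0/...'] counts exactly one data line; the blank
    # line and the '#' comment are skipped.
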
    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek. This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The line count is not stored in the metadata; this can happen
                    # when the file is larger than max_optional_metadata_filesize,
                    # so it must be detected here.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)),
                                                   inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (util.commaify(str(line_count)),
                                           inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        All WURCS files start with 'WURCS=' followed by the version number;
        see http://www.wurcs-wg.org/definition.php and http://rings.t.soka.ac.jp/
        Example:
        WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1
        """
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to make the check case-insensitive
            f.close()
            if firstline.startswith("WURCS"):
                return True
            else:
                return False
        except Exception:
            traceback.print_exc(file=sys.stdout)
            return False
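
    # Sketch of the first-line check above on the docstring's example (the
    # string is truncated here for brevity):
    #   'WURCS=2.0/4,3/...'.upper().startswith('WURCS')  ->  True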

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []
            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1
                lines_per_file.append(chunk)
                remainder -= 1
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
