datatypes/glycan.py @ 0:0e941a69a6fa (draft, default, tip)
Uploaded
| author | chrisb |
|---|---|
| date | Wed, 23 Mar 2016 14:34:50 -0400 |
| parents | |
| children | |

```python
__license__ = "MIT"

import logging
from galaxy.datatypes import metadata
import mimetypes
import os
import shutil
import sys
import traceback
import tempfile
import zipfile
from cgi import escape
from inspect import isclass
import galaxy.util as util
from galaxy.datatypes import data
from galaxy.datatypes.metadata import \
    MetadataElement  # import directly to maintain ease of use in Datatype class definitions
from galaxy.util import inflector
from galaxy.util.bunch import Bunch
from galaxy.util.odict import odict
from galaxy.util.sanitize_html import sanitize_html

from galaxy.datatypes import dataproviders

from galaxy import eggs

eggs.require("Paste")
import paste

log = logging.getLogger(__name__)


class kcf(data.Data):
    file_ext = 'kcf'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All KCF files put 'ENTRY' on their first line; also check for
        'Glycan' to confirm the entry is a glycan."""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            kcfresponse = client.service.DeterminingForm(file(filename, 'r').read())
            if kcfresponse.array[0] == "KCF":
                return True
            else:
                return False
        except ImportError:
            # suds is unavailable, so fall back to a simple first-line check
            print "using KCF simple checker"
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            f.close()

            if "ENTRY" in firstline and "GLYCAN" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
```
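The `number_of_parts` branch of `split()` distributes `length` lines over `parts` chunks with `divmod`, handing the remainder out one extra line at a time. A minimal standalone sketch of just that distribution arithmetic (the helper name `distribute_lines` is hypothetical):

```python
def distribute_lines(length, parts):
    """Mirror the number_of_parts arithmetic in split() above."""
    parts = min(parts, length)  # never make more parts than there are lines
    len_each, remainder = divmod(length, parts)
    lines_per_file = []
    while length > 0:
        chunk = len_each
        if remainder > 0:
            chunk += 1  # the first `remainder` chunks each take one extra line
            remainder -= 1
        lines_per_file.append(chunk)
        length -= chunk
    return lines_per_file

print distribute_lines(10, 3)  # -> [4, 3, 3]
```

The chunk sizes always sum to `length`, so the per-part loop in `split()` consumes the input file exactly once.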
```python
class glycoct(data.Data):
    file_ext = 'glycoct'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All glycoct_condensed files put 'RES' on their first line and 'LIN' on a later line."""
        try:
            f = open(filename, "r")
            firstline = f.readline().upper()  # uppercase to avoid case sensitivity
            lines = f.read()
            f.close()

            if "RES" in firstline and "LIN" in lines:
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
```
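Unlike the other sniffers in this file, `glycoct.sniff()` inspects the file purely locally (no SOAP call), so it is easy to exercise in isolation. A small sketch, assuming the datatype class can be instantiated directly; the condensed-GlycoCT content below is illustrative only:

```python
import tempfile

sample = "RES\n1b:b-dglc-HEX-1:5\nLIN\n"
tmp = tempfile.NamedTemporaryFile(suffix=".glycoct", delete=False)
tmp.write(sample)
tmp.close()

print glycoct().sniff(tmp.name)  # -> True: 'RES' on the first line, 'LIN' later
```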
```python
# ------------- Utility methods --------------

# nice_size used to be here, but to resolve cyclical dependencies it's been
# moved to galaxy.util.  It belongs there anyway since it's used outside
# datatypes.
nice_size = util.nice_size


def get_test_fname(fname):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join(path, 'test', fname)
    return full_path


def get_file_peek(file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[]):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'

    """
    # Set size for file.readline() to a negative number to force it to
    # read until either a newline or EOF.  Needed for datasets with very
    # long lines.
    if WIDTH == 'unlimited':
        WIDTH = -1
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open(file_name, "U")
    while count <= LINE_COUNT:
        line = temp.readline(WIDTH)
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord(char) > 128:
                        file_type = 'binary'
                        break
                data_checked = True
            if file_type in ['gzipped', 'binary']:
                break
        skip_line = False
        for skipchar in skipchars:
            if line.startswith(skipchar):
                skip_line = True
                break
        if not skip_line:
            lines.append(line)
            count += 1
    temp.close()
    if file_type in ['gzipped', 'binary']:
        text = "%s file" % file_type
    else:
        try:
            text = unicode('\n'.join(lines), 'utf-8')
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text
```
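A usage sketch for `get_file_peek()` ('input.kcf' is a placeholder path): it returns roughly the first `LINE_COUNT` lines, each `readline()` call capped at `WIDTH` characters, or a short label such as 'gzipped file' when the content looks compressed or binary:

```python
# Default peek: a handful of lines, 256 characters per readline() call
print get_file_peek('input.kcf')

# Unlimited line width, skipping comment lines
print get_file_peek('input.kcf', WIDTH='unlimited', skipchars=['#'])
```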
```python
class glycoct_xml(data.Data):
    file_ext = 'glycoct_xml'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GlycoCT XML files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GlycoCT":
                return True
            else:
                return False
        except ImportError:
            # suds is unavailable, so fall back to a simple root-element check
            print "using glycoct XML simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'sugar':
                print root.tag, root.attrib
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
```
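The `ImportError` fallback in `glycoct_xml.sniff()` only tests the XML root element name. The same check in isolation (the one-element document is illustrative only):

```python
import xml.etree.cElementTree as ET
from StringIO import StringIO

root = ET.parse(StringIO("<sugar version='1.0'/>")).getroot()
print root.tag == 'sugar'  # -> True; the root tag is all the fallback examines
```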
```python
class glydeii(data.Data):
    file_ext = 'glydeii'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/xml'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All GlydeII XML files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "GLYDEII":
                return True
            else:
                return False
        except ImportError:
            # suds is unavailable, so fall back to a simple root-element check
            print "using GlydeII simple checker"
            import xml.etree.cElementTree as ET

            tree = ET.parse(filename)
            root = tree.getroot()
            if root.tag == 'GlydeII':
                print root.tag
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)


class linucs(data.Data):
    file_ext = 'linucs'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All LINUCS files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "LINUCS":
                return True
            else:
                return False
        except ImportError:
            # suds is unavailable, so fall back to a simple bracket check
            print "using LINUCS simple checker"

            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline and "]" in firstline and "{" in firstline and "}" in firstline:
                return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
```
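For large files, `estimate_file_lines()` extrapolates from a 1 MiB sample instead of scanning the whole file. Worked numbers (hypothetical):

```python
sample_size = 1048576          # bytes actually read (1 MiB)
sample_lines = 4096            # newlines seen in that sample
file_size = 8 * 1048576        # dataset.get_size() for an 8 MiB file
est_lines = int(sample_lines * (float(file_size) / float(sample_size)))
print est_lines                # -> 32768
```

`set_peek()` only takes this path for datasets larger than the sample size; smaller files are recounted exactly with `count_data_lines()`.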
```python
class iupac(data.Data):
    file_ext = 'iupac'
    line_class = 'line'

    # Add metadata elements
    MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True,
                    visible=False, no_value=0)

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while True:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)  # drop the temporary copy once rewritten

    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove(temp_name)

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)

    def estimate_file_lines(self, dataset):
        """
        Perform a rough estimate by extrapolating number of lines from a small read.
        """
        sample_size = 1048576
        dataset_fh = open(dataset.file_name)
        dataset_read = dataset_fh.read(sample_size)
        dataset_fh.close()
        sample_lines = dataset_read.count('\n')
        est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
        return est_lines

    def count_data_lines(self, dataset):
        """
        Count the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file(dataset.file_name):
            line = line.strip()
            if line and not line.startswith('#'):
                data_lines += 1
        return data_lines

    def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]):
        """
        Set the peek.  This method is used by various subclasses of Text.
        """
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH,
                                         skipchars=skipchars)
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)),
                                               inflector.cond_plural(dataset.metadata.data_lines, self.line_class))
                else:
                    # The number of lines is not known and auto-detect is needed to set the
                    # metadata; this can happen when the file is larger than
                    # max_optional_metadata_filesize.
                    if int(dataset.get_size()) <= 1048576:
                        # Small dataset, recount all lines and reset peek afterward.
                        lc = self.count_data_lines(dataset)
                        dataset.metadata.data_lines = lc
                        dataset.blurb = "%s %s" % (util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class))
                    else:
                        est_lines = self.estimate_file_lines(dataset)
                        dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))),
                                                    inflector.cond_plural(est_lines, self.line_class))
            else:
                dataset.blurb = "%s %s" % (
                    util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class))
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """All IUPAC files should use the RINGS form determination service"""
        try:
            from suds.client import Client

            url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl'
            client = Client(url)
            response = client.service.DeterminingForm(file(filename, 'r').read())
            if response.array[0] == "IUPAC":
                return True
            else:
                return False
        except ImportError:
            # suds is unavailable, so fall back to a simple bracket check:
            # IUPAC uses [], () but, unlike LINUCS, no curly braces
            print "using IUPAC simple checker"
            f = open(filename, "r")
            firstline = f.readline()
            f.close()

            if "[" in firstline or "]" in firstline or "(" in firstline or ")" in firstline:
                if "{" in firstline or "}" in firstline:
                    return False
                else:
                    return True
            else:
                return False
        except Exception, e:
            # do not raise; return False and let another sniffer try to type this data
            traceback.print_exc(file=sys.stdout)
            return False

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by line.
        """
        if split_params is None:
            return

        if len(input_datasets) > 1:
            raise Exception("Text file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        lines_per_file = None
        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            lines_per_file = []

            # Computing the length is expensive!
            def _file_len(fname):
                i = 0
                f = open(fname)
                for i, l in enumerate(f):
                    pass
                f.close()
                return i + 1

            length = _file_len(input_files[0])
            parts = int(split_params['split_size'])
            if length < parts:
                parts = length
            len_each, remainder = divmod(length, parts)
            while length > 0:
                chunk = len_each
                if remainder > 0:
                    chunk += 1  # hand the remainder out one extra line at a time
                    remainder -= 1
                lines_per_file.append(chunk)
                length -= chunk
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        f = open(input_files[0], 'rt')
        try:
            chunk_idx = 0
            file_done = False
            part_file = None
            while not file_done:
                if lines_per_file is None:
                    this_chunk_size = chunk_size
                elif chunk_idx < len(lines_per_file):
                    this_chunk_size = lines_per_file[chunk_idx]
                    chunk_idx += 1
                lines_remaining = this_chunk_size
                part_file = None
                while lines_remaining > 0:
                    a_line = f.readline()
                    if a_line == '':
                        file_done = True
                        break
                    if part_file is None:
                        part_dir = subdir_generator_function()
                        part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
                        part_file = open(part_path, 'w')
                    part_file.write(a_line)
                    lines_remaining -= 1
                if part_file is not None:
                    part_file.close()
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            f.close()
            if part_file is not None:
                part_file.close()
            raise
        f.close()

    split = classmethod(split)
```
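Every suds-based sniffer above funnels through the same RINGS `DeterminingForm` SOAP operation and compares `array[0]` against a format name. A direct-call sketch (requires the suds package and a reachable service; 'input.txt' is a placeholder):

```python
from suds.client import Client

# WSDL endpoint taken from the sniffers above; the service may be unreachable
client = Client('http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl')
form = client.service.DeterminingForm(open('input.txt').read())
print form.array[0]  # e.g. "KCF", "GlycoCT", "GLYDEII", "LINUCS", "IUPAC" or "LinearCode"
```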
| 1361 | |
| 1362 class linearcode(data.Data): | |
| 1363 file_ext = 'linearcode' | |
| 1364 line_class = 'line' | |
| 1365 | |
| 1366 """Add metadata elements""" | |
| 1367 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, | |
| 1368 visible=False, no_value=0) | |
| 1369 | |
| 1370 def write_from_stream(self, dataset, stream): | |
| 1371 """Writes data from a stream""" | |
| 1372 # write it twice for now | |
| 1373 fd, temp_name = tempfile.mkstemp() | |
| 1374 while 1: | |
| 1375 chunk = stream.read(1048576) | |
| 1376 if not chunk: | |
| 1377 break | |
| 1378 os.write(fd, chunk) | |
| 1379 os.close(fd) | |
| 1380 # rewrite the file with unix newlines | |
| 1381 fp = open(dataset.file_name, 'wt') | |
| 1382 for line in file(temp_name, "U"): | |
| 1383 line = line.strip() + '\n' | |
| 1384 fp.write(line) | |
| 1385 fp.close() | |
| 1386 | |
| 1387 def set_raw_data(self, dataset, data): | |
| 1388 """Saves the data on the disc""" | |
| 1389 fd, temp_name = tempfile.mkstemp() | |
| 1390 os.write(fd, data) | |
| 1391 os.close(fd) | |
| 1392 # rewrite the file with unix newlines | |
| 1393 fp = open(dataset.file_name, 'wt') | |
| 1394 for line in file(temp_name, "U"): | |
| 1395 line = line.strip() + '\n' | |
| 1396 fp.write(line) | |
| 1397 fp.close() | |
| 1398 os.remove(temp_name) | |
| 1399 | |
| 1400 def get_mime(self): | |
| 1401 """Returns the mime type of the datatype""" | |
| 1402 return 'text/plain' | |
| 1403 | |
| 1404 def set_meta(self, dataset, **kwd): | |
| 1405 """ | |
| 1406 Set the number of lines of data in dataset. | |
| 1407 """ | |
| 1408 dataset.metadata.data_lines = self.count_data_lines(dataset) | |
| 1409 | |
| 1410 def estimate_file_lines(self, dataset): | |
| 1411 """ | |
| 1412 Perform a rough estimate by extrapolating number of lines from a small read. | |
| 1413 """ | |
| 1414 sample_size = 1048576 | |
| 1415 dataset_fh = open(dataset.file_name) | |
| 1416 dataset_read = dataset_fh.read(sample_size) | |
| 1417 dataset_fh.close() | |
| 1418 sample_lines = dataset_read.count('\n') | |
| 1419 est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) | |
| 1420 return est_lines | |
| 1421 | |
| 1422 def count_data_lines(self, dataset): | |
| 1423 """ | |
| 1424 Count the number of lines of data in dataset, | |
| 1425 skipping all blank lines and comments. | |
| 1426 """ | |
| 1427 data_lines = 0 | |
| 1428 for line in file(dataset.file_name): | |
| 1429 line = line.strip() | |
| 1430 if line and not line.startswith('#'): | |
| 1431 data_lines += 1 | |
| 1432 return data_lines | |
| 1433 | |
| 1434 def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): | |
| 1435 """ | |
| 1436 Set the peek. This method is used by various subclasses of Text. | |
| 1437 """ | |
| 1438 if not dataset.dataset.purged: | |
| 1439 # The file must exist on disk for the get_file_peek() method | |
| 1440 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, | |
| 1441 skipchars=skipchars) | |
| 1442 if line_count is None: | |
| 1443 # See if line_count is stored in the metadata | |
| 1444 if dataset.metadata.data_lines: | |
| 1445 dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), | |
| 1446 inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) | |
| 1447 else: | |
| 1448 # Number of lines is not known ( this should not happen ), and auto-detect is | |
| 1449 # needed to set metadata | |
| 1450 # This can happen when the file is larger than max_optional_metadata_filesize. | |
| 1451 if int(dataset.get_size()) <= 1048576: | |
| 1452 # Small dataset, recount all lines and reset peek afterward. | |
| 1453 lc = self.count_data_lines(dataset) | |
| 1454 dataset.metadata.data_lines = lc | |
| 1455 dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) | |
| 1456 else: | |
| 1457 est_lines = self.estimate_file_lines(dataset) | |
| 1458 dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), | |
| 1459 inflector.cond_plural(est_lines, self.line_class) ) | |
| 1460 else: | |
| 1461 dataset.blurb = "%s %s" % ( | |
| 1462 util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) | |
| 1463 else: | |
| 1464 dataset.peek = 'file does not exist' | |
| 1465 dataset.blurb = 'file purged from disk' | |
| 1466 | |
| 1467 def sniff(self, filename): | |
| 1468 """All linear code files should use the rings form determination script """ | |
| 1469 try: | |
| 1470 from suds.client import Client | |
| 1471 | |
| 1472 url = 'http://rings.t.soka.ac.jp/axis2/services/Utilities?wsdl' | |
| 1473 client = Client(url) | |
| 1474 lcresponse = client.service.DeterminingForm(file(filename, 'r').read()) | |
| 1475 if lcresponse.array[0] == "LinearCode": | |
| 1476 print "LinearCode" | |
| 1477 return True | |
| 1478 else: | |
| 1479 print "Unable to guess format" | |
| 1480 return False | |
| 1481 except ImportError: | |
| 1482 # suds is not installed; a simple local checker does not exist yet (see the sketch below) | |
| 1483 print "suds unavailable; no simple LinearCode checker implemented yet" | |
| 1484 return False | |
| 1485 except Exception: | |
| 1486 # Deliberately not raising; return False so another sniffer can try to type this data | |
| 1487 traceback.print_exc(file=sys.stdout) | |
| 1488 return False | |
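| | |
|     def _simple_linearcode_check(self, filename): | |
|         """Hypothetical local fallback for sniff() when suds is unavailable -- | |
|         a minimal sketch, not part of the original file. It only screens the | |
|         first line for characters plausible in LinearCode (residue letters, | |
|         linkage digits, and branch punctuation); a real validator would need | |
|         the published LinearCode grammar. | |
|         """ | |
|         import re | |
|         try: | |
|             first_line = open(filename, 'r').readline().strip() | |
|         except IOError: | |
|             return False | |
|         return bool(first_line) and \ | |
|             re.match(r'^[A-Za-z0-9()\[\];,?=-]+$', first_line) is not None | |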
| 1489 | |
| 1490 def split(cls, input_datasets, subdir_generator_function, split_params): | |
| 1491 """ | |
| 1492 Split the input files by line. | |
| 1493 """ | |
| 1494 if split_params is None: | |
| 1495 return | |
| 1496 | |
| 1497 if len(input_datasets) > 1: | |
| 1498 raise Exception("Text file splitting does not support multiple files") | |
| 1499 input_files = [ds.file_name for ds in input_datasets] | |
| 1500 | |
| 1501 lines_per_file = None | |
| 1502 chunk_size = None | |
| 1503 if split_params['split_mode'] == 'number_of_parts': | |
| 1504 lines_per_file = [] | |
| 1505 # Computing the length is expensive! | |
| 1506 def _file_len(fname): | |
| 1507 i = 0 | |
| 1508 f = open(fname) | |
| 1509 for i, l in enumerate(f): | |
| 1510 pass | |
| 1511 f.close() | |
| 1512 return i + 1 | |
| 1513 | |
| 1514 length = _file_len(input_files[0]) | |
| 1515 parts = int(split_params['split_size']) | |
| 1516 if length < parts: | |
| 1517 parts = length | |
| 1518 len_each, remainder = divmod(length, parts) | |
| 1519 while length > 0: | |
| 1520 chunk = len_each | |
| 1521 if remainder > 0: | |
| 1522 chunk += 1 | |
| 1523 lines_per_file.append(chunk) | |
| 1524 remainder -= 1 | |
| 1525 length -= chunk | |
| 1526 elif split_params['split_mode'] == 'to_size': | |
| 1527 chunk_size = int(split_params['split_size']) | |
| 1528 else: | |
| 1529 raise Exception('Unsupported split mode %s' % split_params['split_mode']) | |
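|         # Worked example of the number_of_parts branch (a sketch): splitting a | |
|         # 10-line file into 3 parts gives divmod(10, 3) == (3, 1), so the loop | |
|         # above appends chunks of 4, 3 and 3 lines -- the single remainder line | |
|         # pads the first part. | |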
| 1530 | |
| 1531 f = open(input_files[0], 'rt') | |
| 1532 try: | |
| 1533 chunk_idx = 0 | |
| 1534 file_done = False | |
| 1535 part_file = None | |
| 1536 while not file_done: | |
| 1537 if lines_per_file is None: | |
| 1538 this_chunk_size = chunk_size | |
| 1539 elif chunk_idx < len(lines_per_file): | |
| 1540 this_chunk_size = lines_per_file[chunk_idx] | |
| 1541 chunk_idx += 1 | |
| 1542 lines_remaining = this_chunk_size | |
| 1543 part_file = None | |
| 1544 while lines_remaining > 0: | |
| 1545 a_line = f.readline() | |
| 1546 if a_line == '': | |
| 1547 file_done = True | |
| 1548 break | |
| 1549 if part_file is None: | |
| 1550 part_dir = subdir_generator_function() | |
| 1551 part_path = os.path.join(part_dir, os.path.basename(input_files[0])) | |
| 1552 part_file = open(part_path, 'w') | |
| 1553 part_file.write(a_line) | |
| 1554 lines_remaining -= 1 | |
| 1555 if part_file is not None: | |
| 1556 part_file.close() | |
| 1557 except Exception, e: | |
| 1558 log.error('Unable to split files: %s' % str(e)) | |
| 1559 f.close() | |
| 1560 if part_file is not None: | |
| 1561 part_file.close() | |
| 1562 raise | |
| 1563 f.close() | |
| 1564 | |
| 1565 split = classmethod(split) | |
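|     # Usage sketch (hypothetical values): Galaxy invokes split() with a dict | |
|     # such as {'split_mode': 'number_of_parts', 'split_size': 4} for four | |
|     # parts, or {'split_mode': 'to_size', 'split_size': 1000} for chunks of | |
|     # at most 1000 lines; subdir_generator_function must return a fresh | |
|     # directory for each part file. | |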
| 1566 | |
| 1567 | |
| 1568 class msa(data.Data): | |
| 1569 file_ext = 'msa' | |
| 1570 line_class = 'line' | |
| 1571 | |
| 1572 """Add metadata elements""" | |
| 1573 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, | |
| 1574 visible=False, no_value=0) | |
| 1575 | |
| 1576 def write_from_stream(self, dataset, stream): | |
| 1577 """Writes data from a stream""" | |
| 1578 # write it twice for now | |
| 1579 fd, temp_name = tempfile.mkstemp() | |
| 1580 while 1: | |
| 1581 chunk = stream.read(1048576) | |
| 1582 if not chunk: | |
| 1583 break | |
| 1584 os.write(fd, chunk) | |
| 1585 os.close(fd) | |
| 1586 # rewrite the file with unix newlines | |
| 1587 fp = open(dataset.file_name, 'wt') | |
| 1588 for line in file(temp_name, "U"): | |
| 1589 line = line.strip() + '\n' | |
| 1590 fp.write(line) | |
| 1591 fp.close() | |
| 1592 | |
| 1593 def set_raw_data(self, dataset, data): | |
| 1594 """Saves the data on the disc""" | |
| 1595 fd, temp_name = tempfile.mkstemp() | |
| 1596 os.write(fd, data) | |
| 1597 os.close(fd) | |
| 1598 # rewrite the file with unix newlines | |
| 1599 fp = open(dataset.file_name, 'wt') | |
| 1600 for line in file(temp_name, "U"): | |
| 1601 line = line.strip() + '\n' | |
| 1602 fp.write(line) | |
| 1603 fp.close() | |
| 1604 os.remove(temp_name) | |
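|     # Note on the rewrite above: mode "U" (universal newlines, a Python 2 | |
|     # idiom) accepts any newline convention, and each line is written back | |
|     # terminated with '\n'. Beware that strip() also removes leading | |
|     # whitespace, so indentation-significant content would be altered. | |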
| 1605 | |
| 1606 def get_mime(self): | |
| 1607 """Returns the mime type of the datatype""" | |
| 1608 return 'text/plain' | |
| 1609 | |
| 1610 def set_meta(self, dataset, **kwd): | |
| 1611 """ | |
| 1612 Set the number of lines of data in dataset. | |
| 1613 """ | |
| 1614 dataset.metadata.data_lines = self.count_data_lines(dataset) | |
| 1615 | |
| 1616 def estimate_file_lines(self, dataset): | |
| 1617 """ | |
| 1618 Perform a rough estimate by extrapolating number of lines from a small read. | |
| 1619 """ | |
| 1620 sample_size = 1048576 | |
| 1621 dataset_fh = open(dataset.file_name) | |
| 1622 dataset_read = dataset_fh.read(sample_size) | |
| 1623 dataset_fh.close() | |
| 1624 sample_lines = dataset_read.count('\n') | |
| 1625 est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) | |
| 1626 return est_lines | |
| 1627 | |
| 1628 def count_data_lines(self, dataset): | |
| 1629 """ | |
| 1630 Count the number of lines of data in dataset, | |
| 1631 skipping all blank lines and comments. | |
| 1632 """ | |
| 1633 data_lines = 0 | |
| 1634 for line in file(dataset.file_name): | |
| 1635 line = line.strip() | |
| 1636 if line and not line.startswith('#'): | |
| 1637 data_lines += 1 | |
| 1638 return data_lines | |
| 1639 | |
| 1640 def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): | |
| 1641 """ | |
| 1642 Set the peek. This method is used by various subclasses of Text. | |
| 1643 """ | |
| 1644 if not dataset.dataset.purged: | |
| 1645 # The file must exist on disk for the get_file_peek() method | |
| 1646 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, | |
| 1647 skipchars=skipchars) | |
| 1648 if line_count is None: | |
| 1649 # See if line_count is stored in the metadata | |
| 1650 if dataset.metadata.data_lines: | |
| 1651 dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), | |
| 1652 inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) | |
| 1653 else: | |
| 1654 # Number of lines is not known; this can happen when the file is | |
| 1655 # larger than max_optional_metadata_filesize, in which case metadata | |
| 1656 # auto-detection was skipped and the count is computed or estimated here. | |
| 1657 if int(dataset.get_size()) <= 1048576: | |
| 1658 # Small dataset, recount all lines and reset peek afterward. | |
| 1659 lc = self.count_data_lines(dataset) | |
| 1660 dataset.metadata.data_lines = lc | |
| 1661 dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) | |
| 1662 else: | |
| 1663 est_lines = self.estimate_file_lines(dataset) | |
| 1664 dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), | |
| 1665 inflector.cond_plural(est_lines, self.line_class) ) | |
| 1666 else: | |
| 1667 dataset.blurb = "%s %s" % ( | |
| 1668 util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) | |
| 1669 else: | |
| 1670 dataset.peek = 'file does not exist' | |
| 1671 dataset.blurb = 'file purged from disk' | |
| 1672 | |
| 1673 def sniff(self, filename): | |
| 1674 """All msa Files simply put a '# .msa' in the first line. """ | |
| 1675 try: | |
| 1676 f = open(filename, "r") | |
| 1677 firstline = f.readline().upper()  # uppercase so the comparison is case-insensitive | |
| 1678 f.close() | |
| 1679 | |
| 1680 if "# .MSA" in firstline: | |
| 1681 return True | |
| 1682 else: | |
| 1683 return False | |
| 1684 except: | |
| 1685 traceback.print_exc(file=sys.stdout) | |
| 1686 return False | |
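|     # Usage sketch (hypothetical file, not from the source): a dataset whose | |
|     # first line is "# .msa generated by an aligner" sniffs as msa, because | |
|     # the uppercased line contains "# .MSA"; any other first line does not. | |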
| 1687 | |
| 1688 def split(cls, input_datasets, subdir_generator_function, split_params): | |
| 1689 """ | |
| 1690 Split the input files by line. | |
| 1691 """ | |
| 1692 if split_params is None: | |
| 1693 return | |
| 1694 | |
| 1695 if len(input_datasets) > 1: | |
| 1696 raise Exception("Text file splitting does not support multiple files") | |
| 1697 input_files = [ds.file_name for ds in input_datasets] | |
| 1698 | |
| 1699 lines_per_file = None | |
| 1700 chunk_size = None | |
| 1701 if split_params['split_mode'] == 'number_of_parts': | |
| 1702 lines_per_file = [] | |
| 1703 # Computing the length is expensive! | |
| 1704 def _file_len(fname): | |
| 1705 i = 0 | |
| 1706 f = open(fname) | |
| 1707 for i, l in enumerate(f): | |
| 1708 pass | |
| 1709 f.close() | |
| 1710 return i + 1 | |
| 1711 | |
| 1712 length = _file_len(input_files[0]) | |
| 1713 parts = int(split_params['split_size']) | |
| 1714 if length < parts: | |
| 1715 parts = length | |
| 1716 len_each, remainder = divmod(length, parts) | |
| 1717 while length > 0: | |
| 1718 chunk = len_each | |
| 1719 if remainder > 0: | |
| 1720 chunk += 1 | |
| 1721 lines_per_file.append(chunk) | |
| 1722 remainder -= 1 | |
| 1723 length -= chunk | |
| 1724 elif split_params['split_mode'] == 'to_size': | |
| 1725 chunk_size = int(split_params['split_size']) | |
| 1726 else: | |
| 1727 raise Exception('Unsupported split mode %s' % split_params['split_mode']) | |
| 1728 | |
| 1729 f = open(input_files[0], 'rt') | |
| 1730 try: | |
| 1731 chunk_idx = 0 | |
| 1732 file_done = False | |
| 1733 part_file = None | |
| 1734 while not file_done: | |
| 1735 if lines_per_file is None: | |
| 1736 this_chunk_size = chunk_size | |
| 1737 elif chunk_idx < len(lines_per_file): | |
| 1738 this_chunk_size = lines_per_file[chunk_idx] | |
| 1739 chunk_idx += 1 | |
| 1740 lines_remaining = this_chunk_size | |
| 1741 part_file = None | |
| 1742 while lines_remaining > 0: | |
| 1743 a_line = f.readline() | |
| 1744 if a_line == '': | |
| 1745 file_done = True | |
| 1746 break | |
| 1747 if part_file is None: | |
| 1748 part_dir = subdir_generator_function() | |
| 1749 part_path = os.path.join(part_dir, os.path.basename(input_files[0])) | |
| 1750 part_file = open(part_path, 'w') | |
| 1751 part_file.write(a_line) | |
| 1752 lines_remaining -= 1 | |
| 1753 if part_file is not None: | |
| 1754 part_file.close() | |
| 1755 except Exception, e: | |
| 1756 log.error('Unable to split files: %s' % str(e)) | |
| 1757 f.close() | |
| 1758 if part_file is not None: | |
| 1759 part_file.close() | |
| 1760 raise | |
| 1761 f.close() | |
| 1762 | |
| 1763 split = classmethod(split) | |
| 1764 | |
| 1765 | |
| 1766 class wurcs(data.Data): | |
| 1767 file_ext = 'wurcs' | |
| 1768 line_class = 'line' | |
| 1769 | |
| 1770 """Add metadata elements""" | |
| 1771 MetadataElement(name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, | |
| 1772 visible=False, no_value=0) | |
| 1773 | |
| 1774 def write_from_stream(self, dataset, stream): | |
| 1775 """Writes data from a stream""" | |
| 1776 # write it twice for now | |
| 1777 fd, temp_name = tempfile.mkstemp() | |
| 1778 while 1: | |
| 1779 chunk = stream.read(1048576) | |
| 1780 if not chunk: | |
| 1781 break | |
| 1782 os.write(fd, chunk) | |
| 1783 os.close(fd) | |
| 1784 # rewrite the file with unix newlines | |
| 1785 fp = open(dataset.file_name, 'wt') | |
| 1786 for line in file(temp_name, "U"): | |
| 1787 line = line.strip() + '\n' | |
| 1788 fp.write(line) | |
| 1789 fp.close() | |
| 1790 | |
| 1791 def set_raw_data(self, dataset, data): | |
| 1792 """Saves the data on the disc""" | |
| 1793 fd, temp_name = tempfile.mkstemp() | |
| 1794 os.write(fd, data) | |
| 1795 os.close(fd) | |
| 1796 # rewrite the file with unix newlines | |
| 1797 fp = open(dataset.file_name, 'wt') | |
| 1798 for line in file(temp_name, "U"): | |
| 1799 line = line.strip() + '\n' | |
| 1800 fp.write(line) | |
| 1801 fp.close() | |
| 1802 os.remove(temp_name) | |
| 1803 | |
| 1804 def get_mime(self): | |
| 1805 """Returns the mime type of the datatype""" | |
| 1806 return 'text/plain' | |
| 1807 | |
| 1808 def set_meta(self, dataset, **kwd): | |
| 1809 """ | |
| 1810 Set the number of lines of data in dataset. | |
| 1811 """ | |
| 1812 dataset.metadata.data_lines = self.count_data_lines(dataset) | |
| 1813 | |
| 1814 def estimate_file_lines(self, dataset): | |
| 1815 """ | |
| 1816 Perform a rough estimate by extrapolating number of lines from a small read. | |
| 1817 """ | |
| 1818 sample_size = 1048576 | |
| 1819 dataset_fh = open(dataset.file_name) | |
| 1820 dataset_read = dataset_fh.read(sample_size) | |
| 1821 dataset_fh.close() | |
| 1822 sample_lines = dataset_read.count('\n') | |
| 1823 est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size))) | |
| 1824 return est_lines | |
| 1825 | |
| 1826 def count_data_lines(self, dataset): | |
| 1827 """ | |
| 1828 Count the number of lines of data in dataset, | |
| 1829 skipping all blank lines and comments. | |
| 1830 """ | |
| 1831 data_lines = 0 | |
| 1832 for line in file(dataset.file_name): | |
| 1833 line = line.strip() | |
| 1834 if line and not line.startswith('#'): | |
| 1835 data_lines += 1 | |
| 1836 return data_lines | |
| 1837 | |
| 1838 def set_peek(self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[]): | |
| 1839 """ | |
| 1840 Set the peek. This method is used by various subclasses of Text. | |
| 1841 """ | |
| 1842 if not dataset.dataset.purged: | |
| 1843 # The file must exist on disk for the get_file_peek() method | |
| 1844 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte, WIDTH=WIDTH, | |
| 1845 skipchars=skipchars) | |
| 1846 if line_count is None: | |
| 1847 # See if line_count is stored in the metadata | |
| 1848 if dataset.metadata.data_lines: | |
| 1849 dataset.blurb = "%s %s" % (util.commaify(str(dataset.metadata.data_lines)), | |
| 1850 inflector.cond_plural(dataset.metadata.data_lines, self.line_class) ) | |
| 1851 else: | |
| 1852 # Number of lines is not known; this can happen when the file is | |
| 1853 # larger than max_optional_metadata_filesize, in which case metadata | |
| 1854 # auto-detection was skipped and the count is computed or estimated here. | |
| 1855 if int(dataset.get_size()) <= 1048576: | |
| 1856 # Small dataset, recount all lines and reset peek afterward. | |
| 1857 lc = self.count_data_lines(dataset) | |
| 1858 dataset.metadata.data_lines = lc | |
| 1859 dataset.blurb = "%s %s" % ( util.commaify(str(lc)), inflector.cond_plural(lc, self.line_class) ) | |
| 1860 else: | |
| 1861 est_lines = self.estimate_file_lines(dataset) | |
| 1862 dataset.blurb = "~%s %s" % (util.commaify(util.roundify(str(est_lines))), | |
| 1863 inflector.cond_plural(est_lines, self.line_class) ) | |
| 1864 else: | |
| 1865 dataset.blurb = "%s %s" % ( | |
| 1866 util.commaify(str(line_count)), inflector.cond_plural(line_count, self.line_class) ) | |
| 1867 else: | |
| 1868 dataset.peek = 'file does not exist' | |
| 1869 dataset.blurb = 'file purged from disk' | |
| 1870 | |
| 1871 def sniff(self, filename): | |
| 1872 """All WURCS Files start with WURCS= then the version number. see http://www.wurcs-wg.org/definition.php and http://rings.t.soka.ac.jp/ | |
| 1873 WURCS=2.0/4,3/[x2112h+1:x|1,5|2*NCC/3=O|4*OSO/3=O/3=O][12122a+1:b|1,5][12112h+1:b|1,5|2*NCC/3=O|6*OSO/3=O/3=O][12122a+1:b|1,5]1+3,2+1|2+4,3+1|3+3,4+1""" | |
| 1874 try: | |
| 1875 f = open(filename, "r") | |
| 1876 firstline = f.readline().upper()  # uppercase so the comparison is case-insensitive | |
| 1877 f.close() | |
| 1878 if "WURCS" in firstline: | |
| 1879 return True | |
| 1880 else: | |
| 1881 return False | |
| 1882 except: | |
| 1883 traceback.print_exc(file=sys.stdout) | |
| 1884 return False | |
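|     # Example: the docstring's sample begins "WURCS=2.0/4,3/...", so its | |
|     # uppercased first line starts with "WURCS" and sniff() returns True. | |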
| 1885 | |
| 1887 def split(cls, input_datasets, subdir_generator_function, split_params): | |
| 1888 """ | |
| 1889 Split the input files by line. | |
| 1890 """ | |
| 1891 if split_params is None: | |
| 1892 return | |
| 1893 | |
| 1894 if len(input_datasets) > 1: | |
| 1895 raise Exception("Text file splitting does not support multiple files") | |
| 1896 input_files = [ds.file_name for ds in input_datasets] | |
| 1897 | |
| 1898 lines_per_file = None | |
| 1899 chunk_size = None | |
| 1900 if split_params['split_mode'] == 'number_of_parts': | |
| 1901 lines_per_file = [] | |
| 1902 # Computing the length is expensive! | |
| 1903 def _file_len(fname): | |
| 1904 i = 0 | |
| 1905 f = open(fname) | |
| 1906 for i, l in enumerate(f): | |
| 1907 pass | |
| 1908 f.close() | |
| 1909 return i + 1 | |
| 1910 | |
| 1911 length = _file_len(input_files[0]) | |
| 1912 parts = int(split_params['split_size']) | |
| 1913 if length < parts: | |
| 1914 parts = length | |
| 1915 len_each, remainder = divmod(length, parts) | |
| 1916 while length > 0: | |
| 1917 chunk = len_each | |
| 1918 if remainder > 0: | |
| 1919 chunk += 1 | |
| 1920 lines_per_file.append(chunk) | |
| 1921 remainder -= 1 | |
| 1922 length -= chunk | |
| 1923 elif split_params['split_mode'] == 'to_size': | |
| 1924 chunk_size = int(split_params['split_size']) | |
| 1925 else: | |
| 1926 raise Exception('Unsupported split mode %s' % split_params['split_mode']) | |
| 1927 | |
| 1928 f = open(input_files[0], 'rt') | |
| 1929 try: | |
| 1930 chunk_idx = 0 | |
| 1931 file_done = False | |
| 1932 part_file = None | |
| 1933 while not file_done: | |
| 1934 if lines_per_file is None: | |
| 1935 this_chunk_size = chunk_size | |
| 1936 elif chunk_idx < len(lines_per_file): | |
| 1937 this_chunk_size = lines_per_file[chunk_idx] | |
| 1938 chunk_idx += 1 | |
| 1939 lines_remaining = this_chunk_size | |
| 1940 part_file = None | |
| 1941 while lines_remaining > 0: | |
| 1942 a_line = f.readline() | |
| 1943 if a_line == '': | |
| 1944 file_done = True | |
| 1945 break | |
| 1946 if part_file is None: | |
| 1947 part_dir = subdir_generator_function() | |
| 1948 part_path = os.path.join(part_dir, os.path.basename(input_files[0])) | |
| 1949 part_file = open(part_path, 'w') | |
| 1950 part_file.write(a_line) | |
| 1951 lines_remaining -= 1 | |
| 1952 if part_file is not None: | |
| 1953 part_file.close() | |
| 1954 except Exception, e: | |
| 1955 log.error('Unable to split files: %s' % str(e)) | |
| 1956 f.close() | |
| 1957 if part_file is not None: | |
| 1958 part_file.close() | |
| 1959 raise | |
| 1960 f.close() | |
| 1961 | |
| 1962 split = classmethod(split) | |
| 1963 | |
| 1964 | |
