Mercurial > repos > iuc > msa_datatypes
changeset 0:70227007b991 draft default tip
Imported from capsule None
author | iuc |
---|---|
date | Tue, 22 Apr 2014 13:55:42 -0400 |
parents | |
children | |
files | datatypes_conf.xml msa.py readme.rst test-data/1.stockholm |
diffstat | 4 files changed, 210 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
<?xml version="1.0"?>
<!-- datatypes_conf.xml: registers the multiple sequence alignment datatypes. -->
<datatypes>
    <datatype_files>
        <!-- Python module that implements the Stockholm_1_0 datatype class. -->
        <datatype_file name="msa.py"/>
    </datatype_files>
    <registration>
        <datatype extension="stockholm" type="galaxy.datatypes.msa:Stockholm_1_0" display_in_upload="True" />
        <datatype extension="selex" type="galaxy.datatypes.data:Text" subclass="True"/>
        <datatype extension="clustal" type="galaxy.datatypes.data:Text" subclass="True"/>
        <datatype extension="msf" type="galaxy.datatypes.data:Text" subclass="True"/>
        <!-- PHYLIP interleaved alignment format -->
        <datatype extension="phylip" type="galaxy.datatypes.data:Text" subclass="True"/>
        <datatype extension="vienna" type="galaxy.datatypes.data:Text" subclass="True"/>
    </registration>
    <sniffers>
        <!-- The sniffer must point at the same module as the registered
             datatype (msa.py); this changeset ships no "infernal" module. -->
        <sniffer type="galaxy.datatypes.msa:Stockholm_1_0"/>
    </sniffers>
</datatypes>
# -*- coding: utf-8 -*-
# File: msa.py
"""Galaxy datatypes for multiple sequence alignment (MSA) file formats."""

import logging
import os
import subprocess

from galaxy.datatypes import metadata
from galaxy.datatypes.data import Text
from galaxy.datatypes.data import get_file_peek
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.sniff import get_headers, get_test_fname

log = logging.getLogger(__name__)

# POSIX basic regular expression matching the "# STOCKHOLM 1.0" header line
# that opens every alignment in a Stockholm file.  "[[:space:]][[:space:]]*"
# means "one or more whitespace characters" in BRE syntax; the dot of the
# version number is escaped so that it only matches a literal ".".
STOCKHOLM_HEADER_PATTERN = r'^#[[:space:]][[:space:]]*STOCKHOLM[[:space:]][[:space:]]*1\.0'


def count_special_lines(word, filename, invert=False):
    """
    Count the lines of *filename* that match the grep pattern *word*.

    The external ``grep`` tool is used to speed up searching and counting.

    :param word: pattern handed to ``grep`` (basic regular expression).
    :param filename: path of the file to search.
    :param invert: if True, count the lines that do NOT match (``grep -v``).
    :returns: the number of hits, or 0 if ``grep`` could not be run or its
              output could not be parsed (best effort, never raises for
              those cases).
    """
    cmd = ["grep", "-c"]
    if invert:
        cmd.append("-v")
    # "-e" / "--" keep a pattern or a path that starts with "-" from being
    # interpreted as a grep option.
    cmd.extend(["-e", word, "--", filename])
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        return int(proc.communicate()[0].split()[0])
    except (OSError, IndexError, ValueError) as e:
        # OSError: grep is not available; IndexError: grep printed nothing
        # (e.g. unreadable file); ValueError: unexpected output.
        log.debug("Could not count lines matching %r in %s: %s", word, filename, e)
    return 0


class Stockholm_1_0(Text):
    """Stockholm 1.0 multiple sequence alignment format."""
    file_ext = "stockholm"

    MetadataElement(name="number_of_alignments", default=0, desc="Number of multiple alignments", readonly=True, visible=True, optional=True, no_value=0)

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek text and the blurb (number of alignments) of *dataset*."""
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            # Read the metadata element declared above (number_of_alignments).
            if dataset.metadata.number_of_alignments == 1:
                dataset.blurb = "1 alignment"
            else:
                dataset.blurb = "%s alignments" % dataset.metadata.number_of_alignments
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'

    def sniff(self, filename):
        """Return True if *filename* contains at least one Stockholm 1.0 header line."""
        return count_special_lines(STOCKHOLM_HEADER_PATTERN, filename) > 0

    def set_meta(self, dataset, **kwd):
        """
        Set the number of alignments in dataset.

        The value is the number of "# STOCKHOLM 1.0" header lines in the file.
        """
        dataset.metadata.number_of_alignments = count_special_lines(STOCKHOLM_HEADER_PATTERN, dataset.file_name)

    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input file by alignment records.

        Every record ends with a line that consists of "//".  Records are
        grouped into part files that hold ``split_size`` records each.

        :param input_datasets: datasets to split; exactly one is supported.
        :param subdir_generator_function: callable returning the directory
            into which the next part file is written.
        :param split_params: dict with the keys 'split_mode' and
            'split_size'; if None, nothing is done and None is returned.
        :raises Exception: for multiple input files, an unsupported split
            mode, a non-positive split size, or any error while writing
            the parts (logged and re-raised).
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("STOCKHOLM-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for STOCKHOLM-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        # A chunk size below 1 would fail with a modulo-by-zero (or never
        # flush a chunk) further down; report it clearly instead.
        if chunk_size < 1:
            raise Exception('Split size must be a positive integer, got %s' % chunk_size)

        def _read_stockholm_records(filename):
            """Yield the lines of one record at a time (including the '//' line)."""
            lines = []
            with open(filename) as handle:
                for line in handle:
                    lines.append(line)
                    if line.strip() == '//':
                        yield lines
                        lines = []
            # Do not silently drop data that follows the last '//'
            # terminator (e.g. a final record without terminator).
            if any(line.strip() for line in lines):
                yield lines

        def _write_part_stockholm_file(accumulated_lines):
            """Write the accumulated lines into a new part directory."""
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines(accumulated_lines)

        try:
            stockholm_records = _read_stockholm_records(input_files[0])
            stockholm_lines_accumulated = []
            for counter, stockholm_record in enumerate(stockholm_records, start=1):
                stockholm_lines_accumulated.extend(stockholm_record)
                if counter % chunk_size == 0:
                    _write_part_stockholm_file(stockholm_lines_accumulated)
                    stockholm_lines_accumulated = []
            # Flush the last, possibly smaller, chunk.
            if stockholm_lines_accumulated:
                _write_part_stockholm_file(stockholm_lines_accumulated)
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.rst Tue Apr 22 13:55:42 2014 -0400 @@ -0,0 +1,64 @@ +=============================================================== +Collection of Galaxy Datatypes for Multiple Sequence Alignments +=============================================================== + +This repository is intended as a starting point for collecting all Multiple Sequence Alignment (MSA) +file formats. +Development takes place at https://github.com/bgruening/galaxytools. Feel free to contribute. + +Copyright 2013 by: + +* Bjoern Gruening + + +============ +Installation +============ + +Please install these datatypes from the Galaxy Tool Shed. + +================== +Included Datatypes +================== + +- Stockholm* +- MSF +- phylip +- vienna +- selex +- clustal + +(*) with split/merge and counting functions + + +======= +History +======= + +- v1.1.0: Initial public release + + + + +=============================== +Wrapper Licence (MIT/BSD style) +=============================== + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. 
+ +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE. +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.stockholm Tue Apr 22 13:55:42 2014 -0400 @@ -0,0 +1,18 @@ +# STOCKHOLM 1.0 +#=GF ID UPSK +#=GF SE Predicted; Infernal +#=GF SS Published; PMID 9223489 +#=GF RN [1] +#=GF RM 9223489 +#=GF RT The role of the pseudoknot at the 3' end of turnip yellow mosaic +#=GF RT virus RNA in minus-strand synthesis by the viral RNA-dependent RNA +#=GF RT polymerase. +#=GF RA Deiman BA, Kortlever RM, Pleij CW; +#=GF RL J Virol 1997;71:5990-5996. + +AF035635.1/619-641 UGAGUUCUCGAUCUCUAAAAUCG +M24804.1/82-104 UGAGUUCUCUAUCUCUAAAAUCG +J04373.1/6212-6234 UAAGUUCUCGAUCUUUAAAAUCG +M24803.1/1-23 UAAGUUCUCGAUCUCUAAAAUCG +#=GC SS_cons .AAA....<<<<aaa....>>>> +//