# HG changeset patch
# User brenninc
# Date 1462697057 14400
# Node ID e3b3261e549833ed7c04dde45697dcde7b07a4c1
Uploaded
diff -r 000000000000 -r e3b3261e5498 data_manager/tagdust_architecture_data_manager.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/tagdust_architecture_data_manager.py Sun May 08 04:44:17 2016 -0400
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+"""Galaxy data manager that records a TagDust architecture.
+
+The HMM blocks are validated and written to a file below
+GALAXY_DATA_INDEX_DIR/tagdust_architecture, and a data table entry
+describing that file is written to the JSON output file.
+"""
+from __future__ import print_function
+
+import json
+import optparse
+import os.path
+
+# Letters an HMM block may start with.
+VALID_BLOCK_TYPES = ['R', 'O', 'G', 'B', 'F', 'S', 'P']
+
+
+def _add_data_table_entry(data_manager_dict, data_table_name, data_table_entry):
+    """Append data_table_entry to the named table of data_manager_dict.
+
+    Missing levels are created; the dict is changed in place and returned.
+    """
+    data_tables = data_manager_dict.setdefault('data_tables', {})
+    data_tables.setdefault(data_table_name, []).append(data_table_entry)
+    return data_manager_dict
+
+
+def get_param(name, params, default=None, check_tab=True):
+    """Fetch name from params, print it and validate it with check_param."""
+    value = params.get(name)
+    print(name, value)
+    return check_param(name, value, default=default, check_tab=check_tab)
+
+
+def check_param(name, value, default=None, check_tab=True):
+    """Return value, falling back to default when value is None, '' or '?'.
+
+    Raises Exception when the value is missing and default is falsy, or
+    when check_tab is set and the result contains a tab.
+    """
+    if value in [None, '', '?']:
+        if default:
+            print("Using {0} for {1} as no value provided".format(default, name))
+            value = default
+        else:
+            raise Exception('{0} is not a valid {1}. You must specify a valid {1}.'.format(value, name))
+    if check_tab and "\t" in value:
+        raise Exception('{0} is not a valid {1}. It may not contain a tab because these are used as seperators by galaxy .'.format(value, name))
+    return value
+
+
+def createFileBasedOnHmm(hmms):
+    """Validate the HMM blocks and build the architecture file name.
+
+    Returns (bar_code, file_name).  bar_code is "yes" when a block starts
+    with 'B' and "no" otherwise; file_name is the stripped blocks joined
+    with '_' followed by ".txt".  Raises Exception when hmms is empty or a
+    block is invalid.
+    """
+    if not hmms:
+        raise Exception("No hmm block provided. You must specify at least one hmm block.")
+    bar_code = "no"
+    blocks = []
+    for hmm in hmms:
+        block = hmm["block"].strip()
+        # Slicing keeps empty or one character blocks from raising an
+        # IndexError, so they get the descriptive errors below.
+        if block[:1] not in VALID_BLOCK_TYPES:
+            raise Exception("hmm block {0} is not a valid. It must start with one of ['R','O','G','B','F','S','P'].".format(block))
+        if block[0] == 'B':
+            bar_code = "yes"
+        if block[1:2] != ':':
+            raise Exception("hmm block {0} is not a valid. The second character must be ':'".format(block))
+        # Test the block text: hmm is the surrounding dict, and '"\t" in hmm'
+        # only looked at its keys.
+        if "\t" in block:
+            raise Exception("hmm block {0} is not a valid. It may not contain a tab, due to galaxy using tabs as seperators".format(block))
+        # The block becomes part of the file name, so it must stay a single
+        # path component.
+        if "/" in block or os.sep in block:
+            raise Exception("hmm block {0} is not valid. It may not contain a path separator because it is used in the file name".format(block))
+        blocks.append(block)
+    return bar_code, "_".join(blocks) + ".txt"
+
+
+def get_path(galaxy_tool_dir, file_name):
+    """Return the path of file_name inside the tagdust_architecture directory.
+
+    The directory galaxy_tool_dir/tagdust_architecture is created when
+    nothing exists there.  Raises Exception when a file is in its place.
+    """
+    dir_path = os.path.join(galaxy_tool_dir, "tagdust_architecture")
+    if os.path.isfile(dir_path):
+        raise Exception("Found a file at {0}, but expecting a directory there".format(dir_path))
+    if not os.path.exists(dir_path):
+        os.mkdir(dir_path)
+    return os.path.join(dir_path, file_name)
+
+
+def writeHmm(hmms, file_path):
+    """Write the tagdust command line for the blocks to file_path.
+
+    Block number i (counted from 1) is written as " -i block".
+    """
+    with open(file_path, 'w') as output_file:
+        output_file.write("./tagdust")
+        for i, hmm in enumerate(hmms, 1):
+            # Strip the block so the file holds the validated text.
+            output_file.write(" -{0} {1}".format(i, hmm["block"].strip()))
+        output_file.write("\n")
+
+
+def main():
+    """Read the parameters, write the architecture file and the JSON result."""
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('--data_table_name', action='store', type="string", default=None, help='path')
+    parser.add_option('--json_output_file', action='store', type="string", default=None, help='path')
+    (options, args) = parser.parse_args()
+
+    data_table_name = check_param("data_table_name", options.data_table_name)
+    json_output_file = check_param("json_output_file", options.json_output_file, check_tab=False)
+
+    # The file is read for the input parameters here and overwritten with
+    # the result at the end.
+    with open(json_output_file) as input_file:
+        param_dict = json.load(input_file)
+    params = param_dict.get("param_dict")
+    print("input params:")
+    print(params)
+
+    hmms = get_param("hmms", params)
+    galaxy_tool_dir = get_param("GALAXY_DATA_INDEX_DIR", params)
+
+    data_table_entry = {}
+
+    data_table_entry["barcode"], file_name = createFileBasedOnHmm(hmms)
+    data_table_entry["path"] = get_path(galaxy_tool_dir, file_name)
+    writeHmm(hmms, data_table_entry["path"])
+
+    basename = os.path.basename(data_table_entry["path"])
+    filename = os.path.splitext(basename)[0]
+    data_table_entry["name"] = get_param("name", params, default=filename)
+    data_table_entry["value"] = get_param("value", params, default=data_table_entry["name"])
+    data_table_entry["dbkey"] = get_param("dbkey", params, default=data_table_entry["value"])
+
+    data_manager_dict = {}
+    _add_data_table_entry(data_manager_dict, data_table_name, data_table_entry)
+
+    print("output:")
+    print(data_manager_dict)
+    # save info to json file
+    # Text mode, because json.dumps returns str and Python 3 rejects
+    # writing str to a file opened with 'wb'.
+    with open(json_output_file, 'w') as output_file:
+        output_file.write(json.dumps(data_manager_dict))
+        output_file.write("\n")
+
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r e3b3261e5498 data_manager/tagdust_architecture_data_manager.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/tagdust_architecture_data_manager.xml Sun May 08 04:44:17 2016 -0400
@@ -0,0 +1,93 @@
+
+ architecture creator
+
+ tagdust_architecture_data_manager.py
+ --data_table_name "tagdust_architecture"
+ --json_output_file "${json_output_file}"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Adds a path to the tagdust references.
+
+The tool will check the path exists but NOT check that it holds the expected data type.
+
+If name is not provided a concatenation of hmm values is used.
+
+If value is not provided, the name will be used (or its default)
+
+If dbkey is not provided, the value will be used (or its default)
+
+====
+
+Taken from The TagDust2 Manual http://tagdust.sourceforge.net (part of Version 2_31 download)
+
+Raw sequences produced by next generation sequencing (NGS) machines can contain adapter, linker,
+barcode and fingerprint sequences. TagDust2 is a program to extract and correctly label the sequences
+to be mapped in downstream pipelines.
+TagDust allows users to specify the expected architecture of a read and converts it into a hidden
+Markov model. The latter can assign sequences to a particular barcode (or index) even in the presence
+of sequencing errors.
Sequences not matching the architecture (primer dimers, contaminants etc.) are +automatically discarded. + +TagDust requires an input file containing sequences and a user defined HMM architecture used to ex- +tract the reads. The architecture is composed of a selection of pre-defined building blocks representing +indices, barcodes, spacers and other sequences one might encounter in the raw output of a sequenced +sample. + +HMM Building Blocks + +TagDust comes with a set of pre-defined HMM building blocks. Each includes a silent state at the +beginning and end used to link blocks together. Each block is specified by a unique letter followed +by a colon and some information about the sequence. + +Read +Segment modeling the read. +Code: R:N + +Optional +Segment modeling an optional single or short stretch of nucleotides. +Code: O:N + +G addition +Segment modeling the occasional addition of guanines to the reads. +(89.3% chance of a single G, 19.5% chance of 2 Gs..). +Code: G:G + +Barcode or Index +Segment modeling a set of barcode sequences. For each sequence a separate HMM is created. The +barcode sequences must be given as a comma separated list. A null model of the same length as the +barcode is automatically added and initialized to the background nucleotide frequencies. +Code: B:GTA,AAC + +Fingerprint or Unique Molecular Identifier - UMI +Segment modeling a fingerprint (or unique molecular identifiers). Insertions and deletions are by +default not allowed within a fingerprint segment. +Code: F:NNN + +Spacer +Segment modeling a pre-defined sequence. +Code: S:GTA + +Partial +This segment is used to model sequences that may only be partially present at the 5' or 3' end of +the read. The transition probabilities (orange and blue) are set automatically based on the length +distribution of exactly matching adapters. 
+Code: P:CCTTAA + + + + + + + diff -r 000000000000 -r e3b3261e5498 data_manager_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Sun May 08 04:44:17 2016 -0400 @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff -r 000000000000 -r e3b3261e5498 tool-data/tagdust_architecture.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/tagdust_architecture.loc.sample Sun May 08 04:44:17 2016 -0400 @@ -0,0 +1,15 @@ +#This is a location file for tagdust architecture files + +#See tagdust reference manual for architecture recommendations + +#If the architecture file includes a barcode HMM (B:) set the barcode field to yes +#Otherwise leave it as no + +#Planemo does not handle commas in value field well so avoid these if including the file in testing. + +#file has this format (white space characters are TAB characters): +# +#value dbkey name barcode path +# + + diff -r 000000000000 -r e3b3261e5498 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Sun May 08 04:44:17 2016 -0400 @@ -0,0 +1,7 @@ + + + + value, dbkey, name, barcode, path + +
+