# HG changeset patch
# User brenninc
# Date 1462697057 14400
# Node ID e3b3261e549833ed7c04dde45697dcde7b07a4c1
Uploaded
diff -r 000000000000 -r e3b3261e5498 data_manager/tagdust_architecture_data_manager.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/tagdust_architecture_data_manager.py Sun May 08 04:44:17 2016 -0400
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+"""Galaxy data manager that writes a TagDust architecture file and registers it in a data table."""
+
+import json
+import optparse
+import os.path
+
+
+def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
+    """Append data_table_entry to the named table in data_manager_dict and return the dict."""
+    data_tables = data_manager_dict.setdefault( 'data_tables', {} )
+    data_tables.setdefault( data_table_name, [] ).append( data_table_entry )
+    return data_manager_dict
+
+
+def get_param(name, params, default=None, check_tab=True):
+    """Fetch name from the params dict, echo it to stdout and validate it with check_param."""
+    value = params.get(name)
+    print( "%s %s" % (name, value) )
+    return check_param(name, value, default=default, check_tab=check_tab)
+
+
+def check_param(name, value, default=None, check_tab=True):
+    """Return value, falling back to default when value is None, '' or '?'.
+
+    Raises Exception when the value is missing and no truthy default is given,
+    or when check_tab is set and the resulting value contains a tab.
+    """
+    if value in [ None, '', '?' ]:
+        if default:
+            print( "Using {0} for {1} as no value provided".format( default, name ) )
+            value = default
+        else:
+            raise Exception( '{0} is not a valid {1}. You must specify a valid {1}.'.format( value, name ) )
+    if check_tab and "\t" in value:
+        raise Exception( '{0} is not a valid {1}. It may not contain a tab because these are used as seperators by galaxy .'.format( value, name ) )
+    return value
+
+
+def createFileBasedOnHmm(hmms):
+    """Validate each HMM block and build the architecture file name from them.
+
+    hmms is a list of dicts that each hold the block text under the "block" key.
+    Returns (bar_code, file_name): bar_code is "yes" when any block starts with
+    'B' and "no" otherwise; file_name is the stripped blocks joined by "_" plus
+    ".txt". Raises Exception for a block with an unknown leading letter, without
+    ':' as second character, or containing a tab.
+    """
+    bar_code = "no"
+    blocks = []
+    for hmm in hmms:
+        block = hmm["block"].strip()
+        # Slicing keeps an empty or one-character block on the validation path
+        # instead of failing with IndexError.
+        if block[:1] not in ['R','O','G','B','F','S','P']:
+            raise Exception( "hmm block {0} is not a valid. It must start with one of ['R','O','G','B','F','S','P'].".format( block ) )
+        if block[0] == 'B':
+            bar_code = "yes"
+        if block[1:2] != ':':
+            raise Exception( "hmm block {0} is not a valid. The second character must be ':'".format( block ) )
+        # The tab has to be looked for in the block text, not in the dict holding it.
+        if "\t" in block:
+            raise Exception( "hmm block {0} is not a valid. It may not contain a tab, due to galaxy using tabs as seperators".format( block ) )
+        blocks.append( block )
+    return bar_code, "_".join( blocks ) + ".txt"
+
+
+def get_path(galaxy_tool_dir, file_name):
+    """Return the path of file_name inside galaxy_tool_dir/tagdust_architecture.
+
+    The directory is created, including missing parents, when it does not exist.
+    Raises Exception when a regular file already occupies the directory path.
+    """
+    file_path = os.path.join(galaxy_tool_dir, "tagdust_architecture")
+    if os.path.isfile(file_path):
+        raise Exception( "Found a file at {0}, but expecting a directory there".format( file_path ) )
+    if not os.path.exists(file_path):
+        os.makedirs(file_path)
+    return os.path.join(file_path, file_name)
+
+
+def writeHmm(hmms, file_path):
+    """Write one line to file_path: "./tagdust" followed by " -N BLOCK" for each block, N counting from 1."""
+    parts = ["./tagdust"]
+    for i, hmm in enumerate( hmms, 1 ):
+        # Strip so the written block matches the text validated by createFileBasedOnHmm.
+        parts.append( "-%d %s" % (i, hmm["block"].strip()) )
+    with open( file_path, 'w' ) as output_file:
+        output_file.write( " ".join( parts ) + "\n" )
+
+
+def main():
+    """Create the architecture file described by the JSON parameters and record it as a data table entry.
+
+    The file named by --json_output_file supplies the input parameters and is
+    overwritten with the resulting data table JSON.
+    """
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '--data_table_name', action='store', type="string", default=None, help='path' )
+    parser.add_option( '--json_output_file', action='store', type="string", default=None, help='path' )
+    (options, args) = parser.parse_args()
+
+    data_table_name = check_param("data_table_name", options.data_table_name)
+    json_output_file = check_param("json_output_file", options.json_output_file, check_tab=False)
+
+    # Read inside a with block so the handle is closed before the file is rewritten below.
+    with open( json_output_file ) as input_file:
+        param_dict = json.load( input_file )
+    params = param_dict.get("param_dict")
+    print( "input params:" )
+    print( params )
+
+    hmms = get_param("hmms", params)
+    galaxy_tool_dir = get_param("GALAXY_DATA_INDEX_DIR", params)
+
+    data_table_entry = {}
+
+    data_table_entry["barcode"], file_name = createFileBasedOnHmm(hmms)
+    data_table_entry["path"] = get_path(galaxy_tool_dir, file_name)
+    writeHmm(hmms, data_table_entry["path"])
+
+    # The file name without its extension is the fallback display name.
+    basename = os.path.basename(data_table_entry["path"])
+    filename = os.path.splitext(basename)[0]
+    data_table_entry["name"] = get_param("name", params, default=filename)
+    data_table_entry["value"] = get_param("value", params, default=data_table_entry["name"])
+    data_table_entry["dbkey"] = get_param("dbkey", params, default=data_table_entry["value"])
+
+    data_manager_dict = {}
+    _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry )
+
+    print( "output:" )
+    print( data_manager_dict )
+    # save info to json file
+    with open( json_output_file, 'w' ) as output_file:
+        output_file.write( json.dumps( data_manager_dict ) )
+        output_file.write( "\n" )
+
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r e3b3261e5498 data_manager/tagdust_architecture_data_manager.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/tagdust_architecture_data_manager.xml Sun May 08 04:44:17 2016 -0400
@@ -0,0 +1,93 @@
+
+ architecture creator
+
+ tagdust_architecture_data_manager.py
+ --data_table_name "tagdust_architecture"
+ --json_output_file "${json_output_file}"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Adds a path to the tagdust references.
+
+The tool will check the path exists but NOT check that it holds the expected data type.
+
+If name is not provided a concatenation of hmm values is used.
+
+If value is not provided, the name will be used (or its default)
+
+If dbkey is not provided, the value will be used (or its default)
+
+====
+
+Taken from The TagDust2 Manual http://tagdust.sourceforge.net (part of Version 2_31 download)
+
+Raw sequences produced by next generation sequencing (NGS) machines can contain adapter, linker,
+barcode and fingerprint sequences. TagDust2 is a program to extract and correctly label the sequences
+to be mapped in downstream pipelines.
+TagDust allows users to specify the expected architecture of a read and converts it into a hidden
+Markov model. The latter can assign sequences to a particular barcode (or index) even in the presence
+of sequencing errors. Sequences not matching the architecture (primer dimers, contaminants etc.) are
+automatically discarded.
+
+TagDust requires an input file containing sequences and a user defined HMM architecture used to ex-
+tract the reads. The architecture is composed of a selection of pre-defined building blocks representing
+indices, barcodes, spacers and other sequences one might encounter in the raw output of a sequenced
+sample.
+
+HMM Building Blocks
+
+TagDust comes with a set of pre-defined HMM building blocks. Each includes a silent state at the
+beginning and end used to link blocks together. Each block is specified by a unique letter followed
+by a colon and some information about the sequence.
+
+Read
+Segment modeling the read.
+Code: R:N
+
+Optional
+Segment modeling an optional single or short stretch of nucleotides.
+Code: O:N
+
+G addition
+Segment modeling the occasional addition of guanines to the reads.
+(89.3% chance of a single G , 19.5% chance of 2 Gs..).
+Code: G:G
+
+Barcode or Index
+Segment modeling a set of barcode sequences. For each sequence a separate HMM is created. The
+barcode sequences must be given as a comma separated list. A null model of the same length as the
+barcode is automatically added and initialized to the background nucleotide frequencies.
+Code: B:GTA,AAC
+
+Fingerprint or Unique Molecular Identifier - UMI
+Segment modeling a fingerprint (or unique molecular identifiers). Insertions and deletions are by
+default not allowed within a fingerprint segment.
+Code: F:NNN
+
+Spacer
+Segment modeling a pre-defined sequence.
+Code: S:GTA
+
+Partial
+This segment is used to model sequences that may only be partially present at the 5' or 3' end of
+the read. The transition probabilities (orange and blue) are set automatically based on the length
+distribution of exactly matching adapters.
+Code: P:CCTTAA
+
+
+
+
+
+
+
diff -r 000000000000 -r e3b3261e5498 data_manager_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml Sun May 08 04:44:17 2016 -0400
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r e3b3261e5498 tool-data/tagdust_architecture.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/tagdust_architecture.loc.sample Sun May 08 04:44:17 2016 -0400
@@ -0,0 +1,15 @@
+#This is a location file for tagdust architecture files
+
+#See tagdust reference manual for architecture recommendations
+
+#If the architecture file includes a barcode HMM (B:) set the barcode field to yes
+#Otherwise leave it as no
+
+#Planemo does not handle commas in value field well so avoid these if including the file in testing.
+
+#file has this format (white space characters are TAB characters):
+#
+#value dbkey name barcode path
+#
+
+
diff -r 000000000000 -r e3b3261e5498 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Sun May 08 04:44:17 2016 -0400
@@ -0,0 +1,7 @@
+
+
+
+ value, dbkey, name, barcode, path
+
+
+