# HG changeset patch
# User yating-l
# Date 1492033315 14400
# Node ID 804a93e87cc8ef7355bb9bd567b2f414872705b9
planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit f22711ea7a464bdaf4d5aaea07f2eacf967aa66e-dirty
diff -r 000000000000 -r 804a93e87cc8 TrackHub.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TrackHub.py Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+
+import os
+import subprocess
+import shutil
+import json
+import utils
+
+
+class TrackHub:
+ def __init__(self, inputFiles, reference, outputDirect, tool_dir, genome, extra_files_path, metaData, jbrowse_host):
+ self.input_files = inputFiles.tracks
+ self.outfile = outputDirect
+ self.outfolder = extra_files_path
+ self.out_path = os.path.join(extra_files_path, genome)
+ self.reference = reference
+ self.tool_dir = tool_dir
+ self.metaData = metaData
+ self.raw = os.path.join(self.out_path, 'raw')
+ self.json = os.path.join(self.out_path, 'json')
+ self.jbrowse_host = jbrowse_host
+ try:
+ if os.path.exists(self.json):
+ shutil.rmtree(self.json)
+ os.makedirs(self.json)
+ except OSError as e:
+ print "Cannot create json folder error({0}): {1}".format(e.errno, e.strerror)
+ else:
+ print "Create jbrowse folder {}".format(self.out_path)
+
+ def createHub(self):
+ self.prepareRefseq()
+ for input_file in self.input_files:
+ self.addTrack(input_file)
+ self.indexName()
+ slink = self.makeArchive()
+ self.outHtml(slink)
+ print "Success!\n"
+
+ def prepareRefseq(self):
+ try:
+ #print os.path.join(self.tool_dir, 'prepare-refseqs.pl') + ", '--fasta', " + self.reference +", '--out', self.json])"
+ subprocess.call(['prepare-refseqs.pl', '--fasta', self.reference, '--out', self.json])
+ except OSError as e:
+ print "Cannot prepare reference error({0}): {1}".format(e.errno, e.strerror)
+ #TODO: hard coded the bam and bigwig tracks. Need to allow users to customize the settings
+ def addTrack(self, track):
+ #print "false_path" , track['false_path']
+ if track['false_path'] in self.metaData.keys():
+ metadata = self.metaData[track['false_path']]
+ else:
+ metadata = {}
+ self.SetMetadata(track, metadata)
+ if track['dataType'] == 'bam':
+ self.Bam(track, metadata)
+ # print "add bam track\n"
+ elif track['dataType'] == 'bigwig':
+ self.BigWig(track, metadata)
+ else:
+ flat_file = os.path.join(self.raw, track['fileName'])
+ if track['dataType'] == 'bed':
+ subprocess.call(['flatfile-to-json.pl', '--bed', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
+ elif track['dataType'] == 'bedSpliceJunctions' or track['dataType'] == 'gtf' or track['dataType'] == 'blastxml':
+ subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"glyph": "JBrowse/View/FeatureGlyph/Segments", "category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
+ elif track['dataType'] == 'gff3_transcript':
+ subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"transcriptType": "transcript", "category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
+ else:
+ subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
+
+ def indexName(self):
+ subprocess.call(['generate-names.pl', '-v', '--out', self.json])
+ print "finished name index \n"
+
+ def makeArchive(self):
+ shutil.make_archive(self.out_path, 'zip', self.out_path)
+ file_dir = os.path.abspath(self.outfile)
+ source_dir = os.path.dirname(file_dir)
+ folder_name = os.path.basename(self.outfolder)
+ source_name = os.path.basename(self.out_path)
+ source = os.path.join(source_dir, folder_name, source_name)
+ slink = source.replace('/', '_')
+ slink = os.path.join('/var/www/html/JBrowse-1.12.1/data', slink)
+ try:
+ if os.path.islink(slink):
+ os.unlink(slink)
+ except OSError as oserror:
+ print "Cannot create symlink to the data({0}): {1}".format(oserror.errno, oserror.strerror)
+ os.symlink(source, slink)
+ return slink
+ '''
+ data_folder = '/gonramp/static/JBrowse-1.12.1/jbrowse_hub'
+ try:
+ if os.path.exists(data_folder):
+ if os.path.isdir(data_folder):
+ shutil.rmtree(data_folder)
+ else:
+ os.remove(data_folder)
+ except OSError as oserror:
+ print "Cannot create data folder({0}): {1}".format(oserror.errno, oserror.strerror)
+ shutil.copytree(self.out_path, data_folder)
+ subprocess.call(['chmod', '-R', 'o+rx', '/var/www/html/JBrowse-1.12.1/jbrowse_hub'])
+ shutil.rmtree(self.out_path)
+ '''
+
+ #TODO: this will list all zip files in the filedir and sub-dirs. worked in Galaxy but all list zip files in test-data when
+ #run it locally. May need modify
+ def outHtml(self, slink):
+ with open(self.outfile, 'w') as htmlfile:
+ htmlstr = 'The JBrowse Hub is created:
'
+ zipfiles = '
Download'
+ url = self.jbrowse_host + "/JBrowse-1.12.1/index.html?data=%s"
+ jbrowse_hub = 'View JBrowse Hub' % url
+ filedir_abs = os.path.abspath(self.outfile)
+ filedir = os.path.dirname(filedir_abs)
+ filedir = os.path.join(filedir, self.outfolder)
+ for root, dirs, files in os.walk(filedir):
+ for file in files:
+ if file.endswith('.zip'):
+ relative_directory = os.path.relpath(root, filedir)
+ relative_file_path = os.path.join(relative_directory, file)
+ htmlstr += zipfiles % relative_file_path
+ link_name = os.path.basename(slink)
+ relative_path = os.path.join('data', link_name + '/json')
+ htmlstr += jbrowse_hub % relative_path
+ htmlfile.write(htmlstr)
+
+ def createTrackList(self):
+ trackList = os.path.join(self.json, "trackList.json")
+ if not os.path.exists(trackList):
+ os.mknod(trackList)
+
+ def Bam(self, track, metadata):
+ #create trackList.json if not exist
+ self.createTrackList()
+ json_file = os.path.join(self.json, "trackList.json")
+ bam_track = dict()
+ bam_track['type'] = 'JBrowse/View/Track/Alignments2'
+ bam_track['storeClass'] = 'JBrowse/Store/SeqFeature/BAM'
+ bam_track['urlTemplate'] = os.path.join('../raw', track['fileName'])
+ bam_track['baiUrlTemplate'] = os.path.join('../raw', track['index'])
+ bam_track['label'] = metadata['label']
+ bam_track['category'] = metadata['category']
+ bam_track = json.dumps(bam_track)
+ #Use add-track-json.pl to add bam track to json file
+ new_track = subprocess.Popen(['echo', bam_track], stdout=subprocess.PIPE)
+ subprocess.call(['add-track-json.pl', json_file], stdin=new_track.stdout)
+
+ def BigWig(self, track, metadata):
+ #create trackList.json if not exist
+ self.createTrackList()
+ json_file = os.path.join(self.json, "trackList.json")
+ bigwig_track = dict()
+ bigwig_track['urlTemplate'] = os.path.join('../raw', track['fileName'])
+ bigwig_track['type'] = 'JBrowse/View/Track/Wiggle/XYPlot'
+ bigwig_track['storeClass'] = 'JBrowse/Store/SeqFeature/BigWig'
+ bigwig_track['label'] = metadata['label']
+ bigwig_track['style'] = metadata['style']
+ bigwig_track['category'] = metadata['category']
+ bigwig_track = json.dumps(bigwig_track)
+ #Use add-track-json.pl to add bigwig track to json file
+ new_track = subprocess.Popen(['echo', bigwig_track], stdout=subprocess.PIPE)
+ #output = new_track.communicate()[0]
+ subprocess.call(['add-track-json.pl', json_file], stdin=new_track.stdout)
+
+ #If the metadata is not set, use the default value
+ def SetMetadata(self, track, metadata):
+ if 'label' not in metadata.keys() or metadata['label'] == '':
+ metadata['label'] = track['fileName']
+ if 'color' not in metadata.keys() or metadata['color'] == '':
+ metadata['color'] = "#daa520"
+ if track['dataType'] == 'bigwig':
+ if 'style' not in metadata.keys():
+ metadata['style'] = {}
+ if 'pos_color' not in metadata['style'] or metadata['style']['pos_color'] == '':
+ metadata['style']['pos_color'] = "#FFA600"
+ if 'neg_color' not in metadata['style'] or metadata['style']['neg_color'] == '':
+ metadata['style']['neg_color'] = "#005EFF"
+ if 'category' not in metadata.keys() or metadata['category'] == '':
+ metadata['category'] = "Default group"
+ if track['dataType'] == 'blastxml':
+ metadata['type'] = "G-OnRamp_plugin/BlastAlignment"
+ elif track['dataType'] == 'gff3_transcript' or track['dataType'] == 'gff3_mrna':
+ metadata['type'] = "G-OnRamp_plugin/GenePred"
+ else:
+ metadata['type'] = "CanvasFeatures"
+
+
+
+
+
+
+
diff -r 000000000000 -r 804a93e87cc8 bedToGff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bedToGff3.py Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+'''
+Convert BED format to gff3
+'''
+import os
+from collections import OrderedDict
+import utils
+
+class bedToGff3():
+ def __init__(self, inputBedFile, chrom_sizes, bed_type, output):
+ self.input = inputBedFile
+ #file_dir = os.path.basename(inputBedFile)
+ #print file_dir + "\n\n"
+ self.output = output
+ self.chrom_sizes = chrom_sizes
+ self.type = bed_type
+ if self.type == "trfbig":
+ self.trfbig_to_gff3()
+ if self.type == "regtools":
+ self.splicejunctions_to_gff3()
+
+ def trfbig_to_gff3(self):
+ gff3 = open(self.output, 'w')
+ gff3.write("##gff-version 3\n")
+ sizes_dict = utils.sequence_region(self.chrom_sizes)
+ seq_regions = dict()
+ with open(self.input, 'r') as bed:
+ for line in bed:
+ field = OrderedDict()
+ attribute = OrderedDict()
+ li = line.rstrip().split("\t")
+ field['seqid'] = li[0]
+ if field['seqid'] not in seq_regions:
+ end_region = sizes_dict[field['seqid']]
+ gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+ seq_regions[field['seqid']] = end_region
+ field['source'] = li[3]
+ field['type'] = 'tandem_repeat'
+ # The first base in a chromosome is numbered 0 in BED format
+ field['start'] = str(int(li[1]) + 1)
+ field['end'] = li[2]
+ field['score'] = li[9]
+ field['strand'] = '+'
+ field['phase'] = '.'
+ attribute['length of repeat unit'] = li[4]
+ attribute['mean number of copies of repeat'] = li[5]
+ attribute['length of consensus sequence'] = li[6]
+ attribute['percentage match'] = li[7]
+ attribute['percentage indel'] = li[8]
+ attribute['percent of a\'s in repeat unit'] = li[10]
+ attribute['percent of c\'s in repeat unit'] = li[11]
+ attribute['percent of g\'s in repeat unit'] = li[12]
+ attribute['percent of t\'s in repeat unit'] = li[13]
+ attribute['entropy'] = li[14]
+ attribute['sequence of repeat unit element'] = li[15]
+ utils.write_features(field, attribute, gff3)
+ gff3.close()
+
+
+ def splicejunctions_to_gff3(self):
+ gff3 = open(self.output, 'w')
+ gff3.write("##gff-version 3\n")
+ sizes_dict = utils.sequence_region(self.chrom_sizes)
+ seq_regions = dict()
+ with open(self.input, 'r') as bed:
+ for line in bed:
+ field = OrderedDict()
+ attribute = OrderedDict()
+ li = line.rstrip().split("\t")
+ field['seqid'] = li[0]
+ if field['seqid'] not in seq_regions:
+ end_region = sizes_dict[field['seqid']]
+ gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+ seq_regions[field['seqid']] = end_region
+ field['source'] = li[3]
+ field['type'] = 'junction'
+ # The first base in a chromosome is numbered 0 in BED format
+ field['start'] = int(li[1]) + 1
+ field['end'] = li[2]
+ field['score'] = li[12]
+ field['strand'] = li[5]
+ field['phase'] = '.'
+ attribute['ID'] = li[3]
+ attribute['Name'] = li[3]
+ attribute['blockcount'] = li[9]
+ attribute['blocksizes'] = li[10]
+ attribute['chromstarts'] = li[11]
+ utils.write_features(field, attribute, gff3)
+ utils.child_blocks(field, attribute, gff3)
+ gff3.close()
+
\ No newline at end of file
diff -r 000000000000 -r 804a93e87cc8 blastxmlToGff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blastxmlToGff3.py Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+
+
+from Bio.Blast import NCBIXML
+from collections import OrderedDict
+import utils
+
+
+def align2cigar(hsp_query, hsp_reference):
+ """
+ Build CIGAR representation from an hsp_query
+ input:
+ hsp_query
+ hsp_sbjct
+ output:
+ CIGAR string
+ """
+ query = hsp_query
+ ref = hsp_reference
+ # preType, curType:
+ # 'M' represents match,
+ # 'I' represents insert a gap into the reference sequence,
+ # 'D' represents insert a gap into the target (delete from reference)
+ # some ideas of this algin2cigar function are coming from
+ # https://gist.github.com/ozagordi/099bdb796507da8d9426
+ prevType = 'M'
+ curType = 'M'
+ count = 0
+ cigar = []
+ num = len(query)
+ for i in range(num):
+ if query[i] == '-':
+ curType = 'D'
+ elif ref[i] == '-':
+ curType = 'I'
+ else:
+ curType = 'M'
+ if curType == prevType:
+ count += 1
+ else:
+ cigar.append('%s%d' % (prevType, count))
+ prevType = curType
+ count = 1
+ cigar.append('%s%d' % (curType, count))
+ return ' '.join(cigar)
+
+def gff3_writer(blast_records, gff3_file):
+ gff3 = open(gff3_file, 'a')
+ gff3.write("##gff-version 3\n")
+ seq_regions = dict()
+ for blast_record in blast_records:
+ query_name = blast_record.query.split(" ")[0]
+ source = blast_record.application
+ method = blast_record.matrix
+ for alignment in blast_record.alignments:
+ group = {
+ "parent_field" : OrderedDict(),
+ "parent_attribute" : OrderedDict(),
+ "alignments" : []
+ }
+ title = alignment.title.split(" ")
+ contig_name = title[len(title) - 1]
+ length = alignment.length
+ group['parent_field']['seqid'] = contig_name
+ group['parent_field']['source'] = source
+ group['parent_field']['type'] = 'match'
+ group['parent_attribute']['ID'] = contig_name + '_' + query_name
+ group['parent_attribute']['method'] = method
+ group['parent_attribute']['length'] = length
+ if contig_name not in seq_regions:
+ gff3.write("##sequence-region " + contig_name + ' 1 ' + str(length) + '\n')
+ seq_regions[contig_name] = length
+ match_num = 0
+ coords = [length, 0]
+ for hsp in alignment.hsps:
+ hsp_align = {}
+ field = OrderedDict()
+ attribute = OrderedDict()
+ ref = hsp.sbjct
+ query = hsp.query
+ field['seqid'] = contig_name
+ field['source'] = source
+ field['type'] = 'match_part'
+
+ field['start'] = hsp.sbjct_start
+ if field['start'] < coords[0]:
+ coords[0] = field['start']
+ ref_length = len(ref.replace('-', ''))
+ # if run tblastn, the actual length of reference should be multiplied by 3
+ if source.lower() == "tblastn":
+ ref_length *= 3
+ field['end'] = field['start'] + ref_length - 1
+ if field['end'] > coords[1]:
+ coords[1] = field['end']
+ field['score'] = hsp.score
+ #decide if the alignment in the same strand or reverse strand
+ #reading frame
+ # (+, +), (0, 0), (-, -) => +
+ # (+, -), (-, +) => -
+ if hsp.frame[1] * hsp.frame[0] > 0:
+ field['strand'] = '+'
+ elif hsp.frame[1] * hsp.frame[0] < 0:
+ field['strand'] = '-'
+ else:
+ if hsp.frame[0] + hsp.frame[1] >= 0:
+ field['strand'] = '+'
+ else:
+ field['strand'] = '-'
+ field['phase'] = '.'
+
+ target_start = hsp.query_start
+ target_len = len(query.replace('-', ''))
+ # if run blastx, the actual length of query should be multiplied by 3
+ if source.lower() == "blastx":
+ target_len *= 3
+ target_end = target_start + target_len -1
+ attribute['ID'] = group['parent_attribute']['ID'] + '_match_' + str(match_num)
+ attribute['Parent'] = group['parent_attribute']['ID']
+ attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end)
+ attribute['Gap'] = align2cigar(query, ref)
+ #store the query sequence and match string in the file in order to display alignment with BlastAlignment plugin
+ attribute['subject'] = hsp.sbjct
+ attribute['query'] = hsp.query
+ attribute['match'] = hsp.match
+ attribute['gaps'] = attribute['match'].count(' ')
+ similar = attribute['match'].count('+')
+ attribute['identities'] = len(attribute['match']) - similar - attribute['gaps']
+ attribute['positives'] = attribute['identities'] + similar
+ attribute['expect'] = hsp.expect
+ # show reading frame attribute only if the frame is not (0, 0)
+ attribute['frame'] = hsp.frame[1]
+ match_num += 1
+ hsp_align['field'] = field
+ hsp_align['attribute'] = attribute
+ group['alignments'].append(hsp_align)
+ group['parent_field']['start'] = coords[0]
+ group['parent_field']['end'] = coords[1]
+ group['parent_field']['score'] = group['parent_field']['strand'] = group['parent_field']['phase'] = '.'
+ group['parent_attribute']['match_num'] = match_num
+ group['alignments'].sort(key=lambda x: (x['field']['start'], x['field']['end']))
+ utils.write_features(group['parent_field'], group['parent_attribute'], gff3)
+ prev_end = -1
+ for align in group['alignments']:
+ overlap = ''
+ if align['field']['start'] <= prev_end:
+ overlap += str(align['field']['start']) + ',' + str(prev_end)
+ prev_end = align['field']['end']
+ align['attribute']['overlap'] = overlap
+ utils.write_features(align['field'], align['attribute'], gff3)
+ gff3.close()
+
+def blastxml2gff3(xml_file, gff3_file):
+ result_handle = open(xml_file)
+ blast_records = NCBIXML.parse(result_handle)
+ gff3_writer(blast_records, gff3_file)
+
+if __name__ == "__main__":
+ blastxml2gff3("../dbia3/raw/tblastn_dmel-hits-translation-r6.11.fa_vs_nucleotide_BLAST_database_from_data_3.blastxml", "gff3.txt")
+
diff -r 000000000000 -r 804a93e87cc8 jbrowse_hub.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jbrowse_hub.py Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+import json
+import utils
+import trackObject
+import TrackHub
+
+
+
+def main(argv):
+ parser = argparse.ArgumentParser(description='Create a hub to display in jbrowse.')
+
+ # Reference genome mandatory
+ parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome (Required)')
+
+ # Genome name
+ parser.add_argument('-g', '--genome_name', help='Name of reference genome')
+
+ # Output folder
+ parser.add_argument('-o', '--out', help='output html')
+
+ # Output folder
+ parser.add_argument('-e', '--extra_files_path', help='Directory of JBrowse Hub folder')
+
+ #Tool Directory
+ parser.add_argument('-d', '--tool_directory', help='The directory of JBrowse file convertion scripts and UCSC tools')
+
+ #GFF3
+ parser.add_argument('--gff3', action='append', help='GFF3 format')
+
+ # GFF3 structure: gene->transcription->CDS
+ parser.add_argument('--gff3_transcript', action='append', help='GFF3 format for gene prediction, structure: gene->transcription->CDS')
+
+ # GFF3 structure: gene->mRNA->CDS
+ parser.add_argument('--gff3_mrna', action='append', help='GFF3 format for gene prediction, structure: gene->mRNA->CDS')
+
+ # generic BED
+ parser.add_argument('--bed', action='append', help='BED format')
+
+ # trfBig simple repeats (BED 4+12)
+ parser.add_argument('--bedSimpleRepeats', action='append', help='BED 4+12 format, using simpleRepeats.as')
+
+ # regtools (BED 12+1)
+ parser.add_argument('--bedSpliceJunctions', action='append', help='BED 12+1 format, using spliceJunctions.as')
+
+ # tblastn alignment (blastxml)
+ parser.add_argument('--blastxml', action='append', help='blastxml format from tblastn')
+
+ # BAM format
+ parser.add_argument('--bam', action='append', help='BAM format from HISAT')
+
+ # BIGWIG format
+ parser.add_argument('--bigwig', action='append', help='BIGWIG format to show rnaseq coverage')
+
+ # GTF format
+ parser.add_argument('--gtf', action='append', help='GTF format from StringTie')
+
+ # Metadata json format
+ parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')
+
+ #JBrowse host
+ parser.add_argument('--jbrowse_host', help="JBrowse Host")
+
+ args = parser.parse_args()
+ all_datatype_dictionary = dict()
+
+
+ if not args.fasta:
+ parser.print_help()
+ raise RuntimeError("No reference genome\n")
+ reference = args.fasta
+ genome = 'unknown'
+ out_path = 'unknown.html'
+ extra_files_path = '.'
+ tool_directory = '.'
+ jbrowse_host = ''
+ if args.jbrowse_host:
+ jbrowse_host = args.jbrowse_host
+ if args.genome_name:
+ genome = args.genome_name
+ if args.out:
+ out_path = args.out
+ if args.extra_files_path:
+ extra_files_path = args.extra_files_path
+
+ #tool_directory not work for Galaxy tool, all tools need to exist in the current PATH, deal with it with tool dependencies
+ if args.tool_directory:
+ tool_directory = args.tool_directory
+
+ #Calculate chromsome sizes using genome reference and uscs tools
+ chrom_size = utils.getChromSizes(reference, tool_directory)
+
+ #get metadata from json file
+ json_inputs_data = args.data_json
+ if json_inputs_data:
+ inputs_data = json.loads(json_inputs_data)
+ else:
+ inputs_data = {}
+
+ #print inputs_data
+
+ #Initate trackObject
+ all_tracks = trackObject.trackObject(chrom_size.name, genome, extra_files_path)
+
+ array_inputs_bam = args.bam
+ array_inputs_bed = args.bed
+ array_inputs_bed_simple_repeats = args.bedSimpleRepeats
+ array_inputs_bed_splice_junctions = args.bedSpliceJunctions
+ array_inputs_bigwig = args.bigwig
+ array_inputs_gff3 = args.gff3
+ array_inputs_gff3_transcript = args.gff3_transcript
+ array_inputs_gff3_mrna = args.gff3_mrna
+ array_inputs_gtf = args.gtf
+ array_inputs_blastxml = args.blastxml
+
+ if array_inputs_bam:
+ all_datatype_dictionary['bam'] = array_inputs_bam
+ if array_inputs_bed:
+ all_datatype_dictionary['bed'] = array_inputs_bed
+ if array_inputs_bed_simple_repeats:
+ all_datatype_dictionary['bedSimpleRepeats'] = array_inputs_bed_simple_repeats
+ if array_inputs_bed_splice_junctions:
+ all_datatype_dictionary['bedSpliceJunctions'] = array_inputs_bed_splice_junctions
+ if array_inputs_bigwig:
+ all_datatype_dictionary['bigwig'] = array_inputs_bigwig
+ if array_inputs_gff3:
+ all_datatype_dictionary['gff3'] = array_inputs_gff3
+ if array_inputs_gff3_transcript:
+ all_datatype_dictionary['gff3_transcript'] = array_inputs_gff3_transcript
+ if array_inputs_gff3_mrna:
+ all_datatype_dictionary['gff3_mrna'] = array_inputs_gff3_mrna
+ if array_inputs_gtf:
+ all_datatype_dictionary['gtf'] = array_inputs_gtf
+ if array_inputs_blastxml:
+ all_datatype_dictionary['blastxml'] = array_inputs_blastxml
+
+ print "input tracks: \n", all_datatype_dictionary
+
+ for datatype, inputfiles in all_datatype_dictionary.items():
+ try:
+ if not inputfiles:
+ raise ValueError('empty input, must provide track files!\n')
+ except IOError:
+ print 'Cannot open', datatype
+ else:
+ for f in inputfiles:
+ #metadata = {}
+ #print f
+ #if f in inputs_data.keys():
+ # metadata = inputs_data[f]
+ #print metadata
+ #Convert tracks into gff3 format
+ all_tracks.addToRaw(f, datatype)
+
+ jbrowseHub = TrackHub.TrackHub(all_tracks, reference, out_path, tool_directory, genome, extra_files_path, inputs_data, jbrowse_host)
+ jbrowseHub.createHub()
+
+"""
+def extractMetadata(array_inputs, inputs_data):
+ metadata_dict = {}
+ for input_false_path in array_inputs:
+ for key, data_value in inputs_data.items():
+ if key == input_false_path:
+ metadata_dict[input_false_path]
+"""
+
+if __name__ == "__main__":
+ main(sys.argv)
+
diff -r 000000000000 -r 804a93e87cc8 jbrowse_hub.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jbrowse_hub.xml Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,293 @@
+
+
+ This Galaxy tool is used to prepare your files to be ready for displaying on JBrowse
+
+
+
+ samtools
+ numpy
+ biopython
+ ucsc_tools_340
+ jbrowse_tools
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This Galaxy tool will create a tar file which including raw datasets and json datasets that can be used for
+ JBrowse visualization.
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 804a93e87cc8 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,94 @@
+
+
+
+
+
+
+
+
+
+
+
+
+This package is based on package_biopython_1_67 owned by biopython.
+https://toolshed.g2.bx.psu.edu/repository?user_id=fd5c6d0f82f315d8
+
+This Galaxy Tool Shed package installs Biopython from source, having
+first installed NumPy which is a build time depencency. This requires
+and assumes a standard C compiler is already installed, along with
+the Python header files.
+
+Development of this dependency definition is being done here on GitHub:
+https://github.com/biopython/galaxy_packages
+
+The PYTHONPATH for biopython can be accessed through PYTHONPATH_BIOPYTHON.
+
+
+
+ http://biopython.org/DIST/biopython-1.68.tar.gz
+
+
+
+
+
+ $INSTALL_DIR/lib/python
+
+ export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python &&
+ export PATH=$PATH:$PATH_NUMPY &&
+ export PYTHONPATH=$PYTHONPATH:$PYTHONPATH_NUMPY &&
+ python setup.py install --install-lib $INSTALL_DIR/lib/python
+
+
+ $INSTALL_DIR/lib/python
+ $ENV[PYTHONPATH_NUMPY]
+ $ENV[PATH_NUMPY]
+ $INSTALL_DIR/lib/python
+
+
+
+
+
+
+
+
+
+ http://old-gep.wustl.edu/~galaxy/ucsc_tools_340.tar.gz
+
+ .
+ $INSTALL_DIR/bin
+
+
+
+ $INSTALL_DIR/bin
+
+
+
+ The well known UCSC tools from Jim Kent.
+
+
+
+
+
+ http://jbrowse.org/wordpress/wp-content/plugins/download-monitor/download.php?id=105
+ $INSTALL_DIR/jbrowse
+
+ ./setup.sh
+
+
+ .
+ $INSTALL_DIR/jbrowse
+
+
+ $INSTALL_DIR/jbrowse
+ $INSTALL_DIR/jbrowse/bin
+ $INSTALL_DIR/jbrowse/src
+ $INSTALL_DIR/jbrowse/extlib
+
+
+
+
+ The perl scripts for converting flat files to json.
+
+
+
+
diff -r 000000000000 -r 804a93e87cc8 trackObject.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/trackObject.py Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+import os
+import shutil
+import utils
+import bedToGff3
+import blastxmlToGff3
+
+
+class trackObject:
+ def __init__(self, chrom_size, genome, extra_files_path):
+ self.chrom_size = chrom_size
+ outputDirect = os.path.join(extra_files_path, genome)
+ self.raw_folder = os.path.join(outputDirect, 'raw')
+ #Store metadata of the tracks
+ self.tracks = []
+ try:
+ if os.path.exists(self.raw_folder):
+ if os.path.isdir(self.raw_folder):
+ shutil.rmtree(self.raw_folder)
+ else:
+ os.remove(self.raw_folder)
+ os.makedirs(self.raw_folder)
+ except OSError as oserror:
+ print "Cannot create raw folder error({0}): {1}".format(oserror.errno, oserror.strerror)
+
+ def addToRaw(self, dataFile, dataType):
+ """
+ Convert gff3, BED, blastxml and gtf files into gff3 files
+ and store converted files in folder 'raw'
+ """
+ false_path = os.path.abspath(dataFile)
+ fileName = os.path.basename(dataFile)
+ des_path = os.path.join(self.raw_folder, fileName)
+ track = {}
+ if dataType == 'bed' or dataType == 'gff3' or dataType == 'gff3_mrna' or dataType == 'gff3_transcript' or dataType == 'fasta' or dataType == 'bam' or dataType == 'bigwig':
+ if dataType == 'bam':
+ # JBrowse will raise error: not a BAM file if the filename hasn't .bam extension
+ extension = os.path.splitext(fileName)[1]
+ if extension != '.bam':
+ fileName = fileName + '.bam'
+ des_path = os.path.join(self.raw_folder, fileName)
+ bam_index = utils.createBamIndex(dataFile)
+ indexname = os.path.basename(bam_index)
+ des_path_for_index = os.path.join(self.raw_folder, indexname)
+ shutil.copyfile(bam_index, des_path_for_index)
+ track['index'] = indexname
+
+ try:
+ shutil.copyfile(dataFile, des_path)
+ except shutil.Error as err1:
+ print "Cannot move file, error({0}: {1})".format(err1.errno, err1.strerror)
+ except IOError as err2:
+ print "Cannot move file, error({0}: {1})".format(err2.errno, err2.strerror)
+ elif dataType == 'bedSimpleRepeats':
+ bedToGff3.bedToGff3(dataFile, self.chrom_size, 'trfbig', des_path)
+ elif dataType == 'bedSpliceJunctions':
+ bedToGff3.bedToGff3(dataFile, self.chrom_size, 'regtools', des_path)
+ elif dataType == 'blastxml':
+ blastxmlToGff3.blastxml2gff3(dataFile, des_path)
+ elif dataType == 'gtf':
+ utils.gtfToGff3(dataFile, des_path, self.chrom_size)
+ track['fileName'] = fileName
+ track['dataType'] = dataType
+ track['false_path'] = false_path
+ #self.SetMetadata(track, metaData)
+ self.tracks.append(track)
+
+
\ No newline at end of file
diff -r 000000000000 -r 804a93e87cc8 utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.py Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+
+"""
+This file include common used functions for converting file format to gff3
+"""
+from collections import OrderedDict
+import json
+import subprocess
+import os
+import tempfile
+import string
+
+def write_features(field, attribute, gff3):
+ """
+ The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md)
+ field, attribute are ordered dictionary
+ gff3 is the file handler
+ """
+ attr = []
+ for v in field.values():
+ gff3.write(str(v) + '\t')
+ for k, v in attribute.items():
+ s = str(k) + '=' + str(v)
+ attr.append(s)
+ gff3.write(';'.join(attr))
+ gff3.write('\n')
+
+def getChromSizes(reference, tool_dir):
+ #TODO: find a better way instead of shipping the two exec files with the tool
+ faToTwoBit = os.path.join(tool_dir, 'faToTwoBit')
+ twoBitInfo = os.path.join(tool_dir, 'twoBitInfo')
+ try:
+ twoBitFile = tempfile.NamedTemporaryFile(bufsize=0)
+ chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False)
+ except IOError as err:
+ print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror)
+ try:
+ subprocess.call(['faToTwoBit', reference, twoBitFile.name])
+ except OSError as err:
+ print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror)
+ try:
+ subprocess.call(['twoBitInfo', twoBitFile.name, chrom_sizes.name])
+ except OSError as err:
+ print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror)
+ return chrom_sizes
+
+def sequence_region(chrom_sizes):
+ """
+ This function read from a chromatin size file generated by twoBitInfo and write the information to dict
+ return a dict
+ """
+ f = open(chrom_sizes, 'r')
+ sizes = f.readlines()
+ sizes_dict = {}
+ for line in sizes:
+ chrom_info = line.rstrip().split('\t')
+ sizes_dict[chrom_info[0]] = chrom_info[1]
+ return sizes_dict
+
+def child_blocks(parent_field, parent_attr, gff3):
+ num = 0
+ blockcount = int(parent_attr['blockcount'])
+ chromstart = parent_attr['chromstarts'].split(',')
+ blocksize = parent_attr['blocksizes'].split(',')
+ while num < blockcount:
+ child_attr = OrderedDict()
+ child_field = parent_field
+ child_field['type'] = 'exon_junction'
+ child_field['start'] = int(chromstart[num]) + int(parent_field['start'])
+ child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1
+ child_attr['ID'] = parent_attr['ID'] + '_exon_' + str(num+1)
+ child_attr['Parent'] = parent_attr['ID']
+ write_features(child_field, child_attr, gff3)
+ num = num + 1
+
+def add_tracks_to_json(trackList_json, new_tracks, modify_type):
+ """
+ Add to track configuration (trackList.json)
+ # modify_type = 'add_tracks': add a new track like bam or bigwig, new_track = dict()
+ # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict())
+ """
+ with open(trackList_json, 'r+') as f:
+ data = json.load(f)
+ if modify_type == 'add_tracks':
+ data['tracks'].append(new_tracks)
+ elif modify_type == 'add_attr':
+ for k in new_tracks:
+ for track in data['tracks']:
+ if k.lower() in track['urlTemplate'].lower():
+ attr = new_tracks[k]
+ for k, v in attr.items():
+ track[k] = v
+ f.seek(0, 0)
+ f.write(json.dumps(data, separators=(',' , ':'), indent=4))
+ f.truncate()
+ f.close()
+
+def gtfToGff3(gtf_file, gff3_file, chrom_sizes):
+ """
+ Covert gtf file output from StringTie to gff3 format
+ """
+ gff3 = open(gff3_file, 'w')
+ gff3.write("##gff-version 3\n")
+ sizes_dict = sequence_region(chrom_sizes)
+ seq_regions = dict()
+ parents = dict()
+ with open(gtf_file, 'r') as gtf:
+ for line in gtf:
+ if line.startswith('#'):
+ continue
+ field = OrderedDict()
+ attribute = OrderedDict()
+ li = line.rstrip().split("\t")
+ #print li
+ field['seqid'] = li[0]
+ #print field['seqid']
+ if field['seqid'] not in seq_regions:
+ end_region = sizes_dict[field['seqid']]
+ gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+ seq_regions[field['seqid']] = end_region
+ field['source'] = li[1]
+ field['type'] = li[2]
+ # The first base in a chromosome is numbered 0 in BED format
+ field['start'] = li[3]
+ field['end'] = li[4]
+ field['score'] = li[5]
+ field['strand'] = li[6]
+ field['phase'] = li[7]
+ attr_li = li[8].split(';')
+ gene_id = attr_li[0].split()[1].strip('"')
+ attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end'])
+ if field['type'] == 'transcript':
+ parents[gene_id] = attribute['ID']
+ attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
+ attribute['coverage'] = attr_li[2].split()[1].strip('"')
+ attribute['fpkm'] = attr_li[3].split()[1].strip('"')
+ attribute['tpm'] = attr_li[4].split()[1].strip('"')
+ elif field['type'] == 'exon':
+ attribute['Parent'] = parents[gene_id]
+ attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
+ attribute['coverage'] = attr_li[3].split()[1].strip('"')
+ write_features(field, attribute, gff3)
+ gff3.close()
+
+
+def sanitize_name(input_name):
+ """
+ Galaxy will name all the files and dirs as *.dat,
+ the function can replace '.' to '_' for the dirs
+ """
+ validChars = "_-%s%s" % (string.ascii_letters, string.digits)
+ sanitized_name = ''.join([c if c in validChars else '_' for c in input_name])
+ return "gonramp_" + sanitized_name
+
+def createBamIndex(bamfile):
+ subprocess.call(['samtools', 'index', bamfile])
+ filename = bamfile + '.bai'
+ if os.path.exists(filename):
+ return filename
+ else:
+ raise ValueError('Did not find bai file')