comparison utils.py @ 0:804a93e87cc8 draft

planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit f22711ea7a464bdaf4d5aaea07f2eacf967aa66e-dirty
author yating-l
date Wed, 12 Apr 2017 17:41:55 -0400
parents
children 7e471cdd9e71
comparison
equal deleted inserted replaced
-1:000000000000 0:804a93e87cc8
1 #!/usr/bin/env python
2
3 """
4 This file include common used functions for converting file format to gff3
5 """
6 from collections import OrderedDict
7 import json
8 import subprocess
9 import os
10 import tempfile
11 import string
12
13 def write_features(field, attribute, gff3):
14 """
15 The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md)
16 field, attribute are ordered dictionary
17 gff3 is the file handler
18 """
19 attr = []
20 for v in field.values():
21 gff3.write(str(v) + '\t')
22 for k, v in attribute.items():
23 s = str(k) + '=' + str(v)
24 attr.append(s)
25 gff3.write(';'.join(attr))
26 gff3.write('\n')
27
28 def getChromSizes(reference, tool_dir):
29 #TODO: find a better way instead of shipping the two exec files with the tool
30 faToTwoBit = os.path.join(tool_dir, 'faToTwoBit')
31 twoBitInfo = os.path.join(tool_dir, 'twoBitInfo')
32 try:
33 twoBitFile = tempfile.NamedTemporaryFile(bufsize=0)
34 chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False)
35 except IOError as err:
36 print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror)
37 try:
38 subprocess.call(['faToTwoBit', reference, twoBitFile.name])
39 except OSError as err:
40 print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror)
41 try:
42 subprocess.call(['twoBitInfo', twoBitFile.name, chrom_sizes.name])
43 except OSError as err:
44 print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror)
45 return chrom_sizes
46
47 def sequence_region(chrom_sizes):
48 """
49 This function read from a chromatin size file generated by twoBitInfo and write the information to dict
50 return a dict
51 """
52 f = open(chrom_sizes, 'r')
53 sizes = f.readlines()
54 sizes_dict = {}
55 for line in sizes:
56 chrom_info = line.rstrip().split('\t')
57 sizes_dict[chrom_info[0]] = chrom_info[1]
58 return sizes_dict
59
60 def child_blocks(parent_field, parent_attr, gff3):
61 num = 0
62 blockcount = int(parent_attr['blockcount'])
63 chromstart = parent_attr['chromstarts'].split(',')
64 blocksize = parent_attr['blocksizes'].split(',')
65 while num < blockcount:
66 child_attr = OrderedDict()
67 child_field = parent_field
68 child_field['type'] = 'exon_junction'
69 child_field['start'] = int(chromstart[num]) + int(parent_field['start'])
70 child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1
71 child_attr['ID'] = parent_attr['ID'] + '_exon_' + str(num+1)
72 child_attr['Parent'] = parent_attr['ID']
73 write_features(child_field, child_attr, gff3)
74 num = num + 1
75
76 def add_tracks_to_json(trackList_json, new_tracks, modify_type):
77 """
78 Add to track configuration (trackList.json)
79 # modify_type = 'add_tracks': add a new track like bam or bigwig, new_track = dict()
80 # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict())
81 """
82 with open(trackList_json, 'r+') as f:
83 data = json.load(f)
84 if modify_type == 'add_tracks':
85 data['tracks'].append(new_tracks)
86 elif modify_type == 'add_attr':
87 for k in new_tracks:
88 for track in data['tracks']:
89 if k.lower() in track['urlTemplate'].lower():
90 attr = new_tracks[k]
91 for k, v in attr.items():
92 track[k] = v
93 f.seek(0, 0)
94 f.write(json.dumps(data, separators=(',' , ':'), indent=4))
95 f.truncate()
96 f.close()
97
98 def gtfToGff3(gtf_file, gff3_file, chrom_sizes):
99 """
100 Covert gtf file output from StringTie to gff3 format
101 """
102 gff3 = open(gff3_file, 'w')
103 gff3.write("##gff-version 3\n")
104 sizes_dict = sequence_region(chrom_sizes)
105 seq_regions = dict()
106 parents = dict()
107 with open(gtf_file, 'r') as gtf:
108 for line in gtf:
109 if line.startswith('#'):
110 continue
111 field = OrderedDict()
112 attribute = OrderedDict()
113 li = line.rstrip().split("\t")
114 #print li
115 field['seqid'] = li[0]
116 #print field['seqid']
117 if field['seqid'] not in seq_regions:
118 end_region = sizes_dict[field['seqid']]
119 gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
120 seq_regions[field['seqid']] = end_region
121 field['source'] = li[1]
122 field['type'] = li[2]
123 # The first base in a chromosome is numbered 0 in BED format
124 field['start'] = li[3]
125 field['end'] = li[4]
126 field['score'] = li[5]
127 field['strand'] = li[6]
128 field['phase'] = li[7]
129 attr_li = li[8].split(';')
130 gene_id = attr_li[0].split()[1].strip('"')
131 attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end'])
132 if field['type'] == 'transcript':
133 parents[gene_id] = attribute['ID']
134 attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
135 attribute['coverage'] = attr_li[2].split()[1].strip('"')
136 attribute['fpkm'] = attr_li[3].split()[1].strip('"')
137 attribute['tpm'] = attr_li[4].split()[1].strip('"')
138 elif field['type'] == 'exon':
139 attribute['Parent'] = parents[gene_id]
140 attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
141 attribute['coverage'] = attr_li[3].split()[1].strip('"')
142 write_features(field, attribute, gff3)
143 gff3.close()
144
145
146 def sanitize_name(input_name):
147 """
148 Galaxy will name all the files and dirs as *.dat,
149 the function can replace '.' to '_' for the dirs
150 """
151 validChars = "_-%s%s" % (string.ascii_letters, string.digits)
152 sanitized_name = ''.join([c if c in validChars else '_' for c in input_name])
153 return "gonramp_" + sanitized_name
154
155 def createBamIndex(bamfile):
156 subprocess.call(['samtools', 'index', bamfile])
157 filename = bamfile + '.bai'
158 if os.path.exists(filename):
159 return filename
160 else:
161 raise ValueError('Did not find bai file')