Mercurial > repos > yating-l > jbrowsearchivecreator
comparison utils.py @ 0:804a93e87cc8 draft
planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit f22711ea7a464bdaf4d5aaea07f2eacf967aa66e-dirty
author | yating-l |
---|---|
date | Wed, 12 Apr 2017 17:41:55 -0400 |
parents | |
children | 7e471cdd9e71 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:804a93e87cc8 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 This file include common used functions for converting file format to gff3 | |
5 """ | |
6 from collections import OrderedDict | |
7 import json | |
8 import subprocess | |
9 import os | |
10 import tempfile | |
11 import string | |
12 | |
13 def write_features(field, attribute, gff3): | |
14 """ | |
15 The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) | |
16 field, attribute are ordered dictionary | |
17 gff3 is the file handler | |
18 """ | |
19 attr = [] | |
20 for v in field.values(): | |
21 gff3.write(str(v) + '\t') | |
22 for k, v in attribute.items(): | |
23 s = str(k) + '=' + str(v) | |
24 attr.append(s) | |
25 gff3.write(';'.join(attr)) | |
26 gff3.write('\n') | |
27 | |
28 def getChromSizes(reference, tool_dir): | |
29 #TODO: find a better way instead of shipping the two exec files with the tool | |
30 faToTwoBit = os.path.join(tool_dir, 'faToTwoBit') | |
31 twoBitInfo = os.path.join(tool_dir, 'twoBitInfo') | |
32 try: | |
33 twoBitFile = tempfile.NamedTemporaryFile(bufsize=0) | |
34 chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False) | |
35 except IOError as err: | |
36 print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror) | |
37 try: | |
38 subprocess.call(['faToTwoBit', reference, twoBitFile.name]) | |
39 except OSError as err: | |
40 print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror) | |
41 try: | |
42 subprocess.call(['twoBitInfo', twoBitFile.name, chrom_sizes.name]) | |
43 except OSError as err: | |
44 print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror) | |
45 return chrom_sizes | |
46 | |
47 def sequence_region(chrom_sizes): | |
48 """ | |
49 This function read from a chromatin size file generated by twoBitInfo and write the information to dict | |
50 return a dict | |
51 """ | |
52 f = open(chrom_sizes, 'r') | |
53 sizes = f.readlines() | |
54 sizes_dict = {} | |
55 for line in sizes: | |
56 chrom_info = line.rstrip().split('\t') | |
57 sizes_dict[chrom_info[0]] = chrom_info[1] | |
58 return sizes_dict | |
59 | |
60 def child_blocks(parent_field, parent_attr, gff3): | |
61 num = 0 | |
62 blockcount = int(parent_attr['blockcount']) | |
63 chromstart = parent_attr['chromstarts'].split(',') | |
64 blocksize = parent_attr['blocksizes'].split(',') | |
65 while num < blockcount: | |
66 child_attr = OrderedDict() | |
67 child_field = parent_field | |
68 child_field['type'] = 'exon_junction' | |
69 child_field['start'] = int(chromstart[num]) + int(parent_field['start']) | |
70 child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1 | |
71 child_attr['ID'] = parent_attr['ID'] + '_exon_' + str(num+1) | |
72 child_attr['Parent'] = parent_attr['ID'] | |
73 write_features(child_field, child_attr, gff3) | |
74 num = num + 1 | |
75 | |
76 def add_tracks_to_json(trackList_json, new_tracks, modify_type): | |
77 """ | |
78 Add to track configuration (trackList.json) | |
79 # modify_type = 'add_tracks': add a new track like bam or bigwig, new_track = dict() | |
80 # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict()) | |
81 """ | |
82 with open(trackList_json, 'r+') as f: | |
83 data = json.load(f) | |
84 if modify_type == 'add_tracks': | |
85 data['tracks'].append(new_tracks) | |
86 elif modify_type == 'add_attr': | |
87 for k in new_tracks: | |
88 for track in data['tracks']: | |
89 if k.lower() in track['urlTemplate'].lower(): | |
90 attr = new_tracks[k] | |
91 for k, v in attr.items(): | |
92 track[k] = v | |
93 f.seek(0, 0) | |
94 f.write(json.dumps(data, separators=(',' , ':'), indent=4)) | |
95 f.truncate() | |
96 f.close() | |
97 | |
98 def gtfToGff3(gtf_file, gff3_file, chrom_sizes): | |
99 """ | |
100 Covert gtf file output from StringTie to gff3 format | |
101 """ | |
102 gff3 = open(gff3_file, 'w') | |
103 gff3.write("##gff-version 3\n") | |
104 sizes_dict = sequence_region(chrom_sizes) | |
105 seq_regions = dict() | |
106 parents = dict() | |
107 with open(gtf_file, 'r') as gtf: | |
108 for line in gtf: | |
109 if line.startswith('#'): | |
110 continue | |
111 field = OrderedDict() | |
112 attribute = OrderedDict() | |
113 li = line.rstrip().split("\t") | |
114 #print li | |
115 field['seqid'] = li[0] | |
116 #print field['seqid'] | |
117 if field['seqid'] not in seq_regions: | |
118 end_region = sizes_dict[field['seqid']] | |
119 gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') | |
120 seq_regions[field['seqid']] = end_region | |
121 field['source'] = li[1] | |
122 field['type'] = li[2] | |
123 # The first base in a chromosome is numbered 0 in BED format | |
124 field['start'] = li[3] | |
125 field['end'] = li[4] | |
126 field['score'] = li[5] | |
127 field['strand'] = li[6] | |
128 field['phase'] = li[7] | |
129 attr_li = li[8].split(';') | |
130 gene_id = attr_li[0].split()[1].strip('"') | |
131 attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end']) | |
132 if field['type'] == 'transcript': | |
133 parents[gene_id] = attribute['ID'] | |
134 attribute['transcript_id'] = attr_li[1].split()[1].strip('"') | |
135 attribute['coverage'] = attr_li[2].split()[1].strip('"') | |
136 attribute['fpkm'] = attr_li[3].split()[1].strip('"') | |
137 attribute['tpm'] = attr_li[4].split()[1].strip('"') | |
138 elif field['type'] == 'exon': | |
139 attribute['Parent'] = parents[gene_id] | |
140 attribute['transcript_id'] = attr_li[1].split()[1].strip('"') | |
141 attribute['coverage'] = attr_li[3].split()[1].strip('"') | |
142 write_features(field, attribute, gff3) | |
143 gff3.close() | |
144 | |
145 | |
146 def sanitize_name(input_name): | |
147 """ | |
148 Galaxy will name all the files and dirs as *.dat, | |
149 the function can replace '.' to '_' for the dirs | |
150 """ | |
151 validChars = "_-%s%s" % (string.ascii_letters, string.digits) | |
152 sanitized_name = ''.join([c if c in validChars else '_' for c in input_name]) | |
153 return "gonramp_" + sanitized_name | |
154 | |
155 def createBamIndex(bamfile): | |
156 subprocess.call(['samtools', 'index', bamfile]) | |
157 filename = bamfile + '.bai' | |
158 if os.path.exists(filename): | |
159 return filename | |
160 else: | |
161 raise ValueError('Did not find bai file') |