comparison util/subtools.py @ 7:5d5fdcb798da draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 12fb52d5b285935b2353d93a5aa291838df7893e
| field | value |
|---|---|
| author | yating-l |
| date | Fri, 20 Apr 2018 13:51:23 -0400 |
| parents | 237707a6b74d |
| children | 43a700afd457 |
| 6:237707a6b74d | 7:5d5fdcb798da |
|---|---|
| 7 import json | 7 import json |
| 8 import subprocess | 8 import subprocess |
| 9 import os | 9 import os |
| 10 import sys | 10 import sys |
| 11 import tempfile | 11 import tempfile |
| 12 import string | 12 import shutil |
| 13 import logging | 13 import logging |
| 14 | 14 |
| 15 class PopenError(Exception): | 15 class PopenError(Exception): |
| 16 def __init__(self, cmd, error, return_code): | 16 def __init__(self, cmd, error, return_code): |
| 17 self.cmd = cmd | 17 self.cmd = cmd |
| 226 filename = bamfile + '.bai' | 226 filename = bamfile + '.bai' |
| 227 if os.path.exists(filename): | 227 if os.path.exists(filename): |
| 228 return filename | 228 return filename |
| 229 else: | 229 else: |
| 230 raise ValueError('Did not find bai file') | 230 raise ValueError('Did not find bai file') |
| | 231 |
| | 232 def createFastaIndex(fastaFile): |
| | 233 subprocess.call(['samtools', 'faidx', fastaFile]) |
| | 234 filename = fastaFile + '.fai' |
| | 235 if os.path.exists(filename): |
| | 236 return filename |
| | 237 else: |
| | 238 raise ValueError('Did not find fai file') |
| | 239 |
| | 240 def generate_indexed_refseq_track(fastaFile, referenceName, outputFolder): |
| | 241 faiFile = createFastaIndex(fastaFile) |
| | 242 refSeqFile = os.path.join(outputFolder, referenceName) |
| | 243 refSeqIndexFile = os.path.join(outputFolder, referenceName+'.fai') |
| | 244 shutil.copy(fastaFile, refSeqFile) |
| | 245 shutil.copy(faiFile, refSeqIndexFile) |
| | 246 |
| | 247 def remove_gene_lines(gff3_file, gff3_filtered): |
| | 248 with open(gff3_file, 'r') as f: |
| | 249 with open(gff3_filtered, 'w') as out: |
| | 250 for line in f: |
| | 251 if not line.startswith('#'): |
| | 252 feature_type = line.split('\t')[2].rstrip() |
| | 253 if feature_type == 'transcript' or feature_type == 'mRNA': |
| | 254 arr = line.split('\t') |
| | 255 # as we remove the gene features, we should also remove the Parent attribute (gene id) from the transcript |
| | 256 arr[8] = ';'.join([item for item in arr[8].split(';') if 'Parent=' not in item]).rstrip() |
| | 257 line = '\t'.join(arr) + '\n' |
| | 258 if feature_type == 'gene': |
| | 259 continue |
| | 260 out.write(line) |
| | 261 |
| | 262 def gff3sort(inputFile, outputFile, precise=False): |
| | 263 array_call = ['gff3sort.pl', inputFile] |
| | 264 if precise: |
| | 265 array_call.append('--precise') |
| | 266 p = _handleExceptionAndCheckCall(array_call, stdout=outputFile) |
| | 267 return p |
| | 268 |
| | 269 def bedSort(inputFile, outputFile): |
| | 270 array_call = ['sort', '-k1,1', '-k2,2n', '-k6,6', inputFile] |
| | 271 p = _handleExceptionAndCheckCall(array_call, stdout=outputFile) |
| | 272 return p |
| | 273 |
| | 274 def bgzip(inputFile): |
| | 275 subprocess.call(['bgzip', inputFile]) |
| | 276 filename = inputFile + '.gz' |
| | 277 if os.path.exists(filename): |
| | 278 return filename |
| | 279 else: |
| | 280 raise ValueError('Did not find gz file') |
| | 281 |
| | 282 def createTabix(inputFile, dataType): |
| | 283 subprocess.call(['tabix', '-p', dataType, inputFile]) |
| | 284 filename = inputFile + '.tbi' |
| | 285 if os.path.exists(filename): |
| | 286 return filename |
| | 287 else: |
| | 288 raise ValueError('Did not find tbi file') |
| | 289 |
| | 290 def generate_tabix_indexed_track(inputFile, dataType, trackName, outputFolder): |
| | 291 if "bed" in dataType: |
| | 292 fileType = 'bed' |
| | 293 sortedFile = tempfile.NamedTemporaryFile(bufsize=0) |
| | 294 bedSort(inputFile, sortedFile) |
| | 295 elif "gff" in dataType: |
| | 296 fileType = 'gff' |
| | 297 filteredFile = tempfile.NamedTemporaryFile(bufsize=0) |
| | 298 remove_gene_lines(inputFile, filteredFile.name) |
| | 299 sortedFile = tempfile.NamedTemporaryFile(bufsize=0) |
| | 300 gff3sort(filteredFile.name, sortedFile) |
| | 301 # add .gff3.gz extension to Tabix GFF3 files, in order to enable creating name index with generate-names.pl |
| | 302 trackName = trackName + '.gff3.gz' |
| | 303 compressedFile = bgzip(sortedFile.name) |
| | 304 tabixFile = createTabix(compressedFile, fileType) |
| | 305 trackPath = os.path.join(outputFolder, trackName) |
| | 306 trackIndexPath = os.path.join(outputFolder, trackName+'.tbi') |
| | 307 shutil.copy(compressedFile, trackPath) |
| | 308 shutil.copy(tabixFile, trackIndexPath) |
| 231 | 309 |
| 232 def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=True): | 310 def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=True): |
| 233 if "bed" in dataType: | 311 if "bed" in dataType: |
| 234 fileType = "--bed" | 312 fileType = "--bed" |
| 235 elif "gff" in dataType: | 313 elif "gff" in dataType: |
| 295 track_json = json.dumps(track_json) | 373 track_json = json.dumps(track_json) |
| 296 new_track = subprocess.Popen(['echo', track_json], stdout=subprocess.PIPE) | 374 new_track = subprocess.Popen(['echo', track_json], stdout=subprocess.PIPE) |
| 297 p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout) | 375 p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout) |
| 298 return p | 376 return p |
| 299 | 377 |
| 300 def prepare_refseqs(fasta_file_name, outputFolder): | 378 def prepare_refseqs(fastaFile, outputFolder): |
| 301 array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder] | 379 #array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder] |
| | 380 createFastaIndex(fastaFile) |
| | 381 array_call = ['prepare-refseqs.pl', '--indexed_fasta', fastaFile, '--out', outputFolder] |
| 302 p = _handleExceptionAndCheckCall(array_call) | 382 p = _handleExceptionAndCheckCall(array_call) |
| 303 return p | 383 return p |
| 304 | 384 |
| 305 def generate_names(outputFolder): | 385 def generate_names(outputFolder, hashBits=4): |
| 306 array_call = ['generate-names.pl', '-v', '--out', outputFolder] | 386 array_call = ['generate-names.pl', '--hashBits', '4', '-v', '--out', outputFolder] |
| 307 p = _handleExceptionAndCheckCall(array_call) | 387 p = _handleExceptionAndCheckCall(array_call) |
| 308 return p | 388 return p |
| 309 | 389 |
| 310 def validateFiles(input_file, chrom_sizes_file_name, file_type, options=None): | 390 def validateFiles(input_file, chrom_sizes_file_name, file_type, options=None): |
| 311 """ | 391 """ |
