Mercurial > repos > yating-l > jbrowsearchivecreator
diff util/subtools.py @ 7:5d5fdcb798da draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 12fb52d5b285935b2353d93a5aa291838df7893e
author | yating-l |
---|---|
date | Fri, 20 Apr 2018 13:51:23 -0400 |
parents | 237707a6b74d |
children | 43a700afd457 |
line wrap: on
line diff
# Excerpt of util/subtools.py after the change shown in this changeset
# (``import string`` was replaced by ``import shutil``; the functions below
# were added or modified).
#
# Not reproduced here because only fragments of them are visible in the diff:
# the ``PopenError`` exception class, the helper that raises
# 'Did not find bai file', ``flatfile_to_json`` and the helper that pipes a
# track into ``add-track-json.pl``.  ``_handleExceptionAndCheckCall`` is
# called below but defined outside this excerpt.

import os
import sys
import tempfile
import shutil
import logging
import subprocess


def _run_and_locate(command, produced_path, label):
    """Run an external tool and return the file it is expected to create.

    :param command: argument list handed to ``subprocess.call``.
    :param produced_path: path the tool should have written.
    :param label: short file-type label used in the error message.
    :returns: ``produced_path``.
    :raises ValueError: if the tool exits with a non-zero status or the
        expected file does not exist afterwards.
    """
    exit_code = subprocess.call(command)
    # Checking the exit status keeps a stale file left by an earlier run from
    # hiding a failure of the current one.
    if exit_code != 0:
        raise ValueError('{0} exited with status {1}'.format(
            ' '.join(command), exit_code))
    if not os.path.exists(produced_path):
        raise ValueError('Did not find {0} file'.format(label))
    return produced_path


def createFastaIndex(fastaFile):
    """Index ``fastaFile`` with ``samtools faidx`` and return the ``.fai`` path.

    :raises ValueError: if samtools fails or the index file is missing.
    """
    return _run_and_locate(['samtools', 'faidx', fastaFile],
                           fastaFile + '.fai', 'fai')


def generate_indexed_refseq_track(fastaFile, referenceName, outputFolder):
    """Copy a FASTA file and its ``.fai`` index into ``outputFolder``.

    The FASTA is stored as ``referenceName`` and the index as
    ``referenceName + '.fai'``.
    """
    faiFile = createFastaIndex(fastaFile)
    refSeqFile = os.path.join(outputFolder, referenceName)
    shutil.copy(fastaFile, refSeqFile)
    shutil.copy(faiFile, refSeqFile + '.fai')


def remove_gene_lines(gff3_file, gff3_filtered):
    """Write a copy of ``gff3_file`` without its ``gene`` features.

    ``transcript`` and ``mRNA`` features lose their ``Parent`` attribute,
    because the gene it points to is no longer in the file.  Comment and
    directive lines (starting with ``#``) and lines that are not tab-separated
    feature records (blank lines, sequence lines) are copied unchanged.

    :param gff3_file: path of the GFF3 file to read.
    :param gff3_filtered: path of the filtered GFF3 file to write.
    """
    with open(gff3_file, 'r') as source, open(gff3_filtered, 'w') as target:
        for line in source:
            if line.startswith('#'):
                target.write(line)
                continue
            columns = line.rstrip('\n').split('\t')
            if len(columns) < 3:
                # Not a feature record: there is no type column to inspect.
                target.write(line)
                continue
            feature_type = columns[2].strip()
            if feature_type == 'gene':
                continue
            if feature_type in ('transcript', 'mRNA') and len(columns) > 8:
                kept = [item for item in columns[8].split(';')
                        if not item.strip().startswith('Parent=')]
                # '.' marks an empty column, so a record whose only attribute
                # was Parent still has nine columns.
                columns[8] = ';'.join(kept).rstrip() or '.'
                line = '\t'.join(columns) + '\n'
            target.write(line)


def gff3sort(inputFile, outputFile, precise=False):
    """Run ``gff3sort.pl`` on ``inputFile``, sending its output to ``outputFile``.

    :param outputFile: passed as ``stdout`` to ``_handleExceptionAndCheckCall``
        (presumably an open file object -- confirm against that helper).
    :param precise: when true, ``--precise`` is added to the command line.
    :returns: whatever ``_handleExceptionAndCheckCall`` returns.
    """
    array_call = ['gff3sort.pl', inputFile]
    if precise:
        array_call.append('--precise')
    return _handleExceptionAndCheckCall(array_call, stdout=outputFile)


def bedSort(inputFile, outputFile):
    """Sort a BED file with ``sort`` on column 1, column 2 (numeric) and column 6.

    :param outputFile: passed as ``stdout`` to ``_handleExceptionAndCheckCall``
        (presumably an open file object -- confirm against that helper).
    :returns: whatever ``_handleExceptionAndCheckCall`` returns.
    """
    array_call = ['sort', '-k1,1', '-k2,2n', '-k6,6', inputFile]
    return _handleExceptionAndCheckCall(array_call, stdout=outputFile)


def bgzip(inputFile):
    """Compress ``inputFile`` with ``bgzip`` and return the ``.gz`` path.

    :raises ValueError: if bgzip fails or the compressed file is missing.
    """
    return _run_and_locate(['bgzip', inputFile], inputFile + '.gz', 'gz')


def createTabix(inputFile, dataType):
    """Index ``inputFile`` with ``tabix -p dataType`` and return the ``.tbi`` path.

    :raises ValueError: if tabix fails or the index file is missing.
    """
    return _run_and_locate(['tabix', '-p', dataType, inputFile],
                           inputFile + '.tbi', 'tbi')


def generate_tabix_indexed_track(inputFile, dataType, trackName, outputFolder):
    """Sort, bgzip and Tabix-index a BED or GFF track into ``outputFolder``.

    BED input is sorted with :func:`bedSort`.  GFF input first has its gene
    features removed (:func:`remove_gene_lines`) and is then sorted with
    :func:`gff3sort`.  The compressed file is copied to
    ``outputFolder/trackName`` and its index to the same path plus ``.tbi``.

    :param dataType: must contain ``"bed"`` or ``"gff"``.
    :raises ValueError: if ``dataType`` is neither BED nor GFF, or if
        compression or indexing fails.
    """
    if "bed" in dataType:
        fileType = 'bed'
    elif "gff" in dataType:
        fileType = 'gff'
        # add .gff3.gz extension to Tabix GFF3 files, in order to enable
        # creating name index with generate-names.pl
        trackName = trackName + '.gff3.gz'
    else:
        raise ValueError(
            'Unsupported data type for Tabix indexing: {0}'.format(dataType))

    # All intermediates live in one scratch directory: bgzip replaces its
    # input with a .gz file and tabix writes a .tbi next to it, so removing
    # the directory is the only cleanup needed.
    workDir = tempfile.mkdtemp()
    try:
        sortedPath = os.path.join(workDir, 'sorted.' + fileType)
        with open(sortedPath, 'wb') as sortedFile:
            if fileType == 'bed':
                bedSort(inputFile, sortedFile)
            else:
                filteredPath = os.path.join(workDir, 'filtered.gff')
                remove_gene_lines(inputFile, filteredPath)
                gff3sort(filteredPath, sortedFile)
        compressedFile = bgzip(sortedPath)
        tabixFile = createTabix(compressedFile, fileType)
        trackPath = os.path.join(outputFolder, trackName)
        shutil.copy(compressedFile, trackPath)
        shutil.copy(tabixFile, trackPath + '.tbi')
    finally:
        shutil.rmtree(workDir, ignore_errors=True)


def prepare_refseqs(fastaFile, outputFolder):
    """Index ``fastaFile`` and register it with ``prepare-refseqs.pl``.

    The FASTA is indexed with :func:`createFastaIndex` and then passed to
    ``prepare-refseqs.pl`` through ``--indexed_fasta`` (the previous revision
    used ``--fasta``).

    :returns: whatever ``_handleExceptionAndCheckCall`` returns.
    """
    createFastaIndex(fastaFile)
    array_call = ['prepare-refseqs.pl', '--indexed_fasta', fastaFile,
                  '--out', outputFolder]
    return _handleExceptionAndCheckCall(array_call)


def generate_names(outputFolder, hashBits=4):
    """Run ``generate-names.pl`` on ``outputFolder``.

    :param hashBits: value passed to ``--hashBits``; the default of 4 matches
        the value that was previously hard-coded.
    :returns: whatever ``_handleExceptionAndCheckCall`` returns.
    """
    array_call = ['generate-names.pl', '--hashBits', str(hashBits),
                  '-v', '--out', outputFolder]
    return _handleExceptionAndCheckCall(array_call)