diff util/subtools.py @ 7:5d5fdcb798da draft

planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 12fb52d5b285935b2353d93a5aa291838df7893e
author yating-l
date Fri, 20 Apr 2018 13:51:23 -0400
parents 237707a6b74d
children 43a700afd457
line wrap: on
line diff
--- a/util/subtools.py	Thu Feb 15 17:05:05 2018 -0500
+++ b/util/subtools.py	Fri Apr 20 13:51:23 2018 -0400
@@ -9,7 +9,7 @@
 import os
 import sys
 import tempfile
-import string
+import shutil
 import logging
 
 class PopenError(Exception):
@@ -229,6 +229,84 @@
     else:
         raise ValueError('Did not find bai file')
 
+def createFastaIndex(fastaFile):
+    """Index a FASTA file with ``samtools faidx`` and return the index path.
+
+    :param fastaFile: path of the FASTA file to index
+    :return: path of the index file, ``fastaFile + '.fai'``
+    :raises ValueError: if samtools exits with a non-zero status, or if the
+        index file does not exist afterwards
+    """
+    returncode = subprocess.call(['samtools', 'faidx', fastaFile])
+    # Check the exit status as well as the file: an index left over from an
+    # earlier run would otherwise make a failed invocation look successful.
+    if returncode != 0:
+        raise ValueError('samtools faidx exited with status %s' % returncode)
+    filename = fastaFile + '.fai'
+    if os.path.exists(filename):
+        return filename
+    else:
+        raise ValueError('Did not find fai file')
+
+def generate_indexed_refseq_track(fastaFile, referenceName, outputFolder):
+    """Copy a FASTA file and its freshly built index into ``outputFolder``.
+
+    The FASTA is indexed with createFastaIndex, then copied to
+    ``outputFolder/referenceName``; the index is copied next to it under
+    ``referenceName + '.fai'``.
+
+    :param fastaFile: path of the source FASTA file
+    :param referenceName: file name to give the copy inside outputFolder
+    :param outputFolder: destination directory
+    """
+    indexPath = createFastaIndex(fastaFile)
+    copies = (
+        (fastaFile, os.path.join(outputFolder, referenceName)),
+        (indexPath, os.path.join(outputFolder, referenceName+'.fai')),
+    )
+    # Sequence first, then its index, same order as the files were produced.
+    for source, destination in copies:
+        shutil.copy(source, destination)
+
+def remove_gene_lines(gff3_file, gff3_filtered):
+    """Copy a GFF3 file while dropping its ``gene`` features.
+
+    Lines starting with '#' are copied unchanged. Lines whose third column is
+    ``gene`` are skipped. For ``transcript`` and ``mRNA`` lines every
+    attribute containing ``Parent=`` is removed from the ninth column, since
+    the gene it referred to is no longer in the output. Blank lines and
+    lines with too few tab-separated columns to be inspected are copied
+    through unchanged instead of raising IndexError.
+
+    :param gff3_file: path of the GFF3 file to read
+    :param gff3_filtered: path of the filtered GFF3 file to write
+    """
+    with open(gff3_file, 'r') as f:
+        with open(gff3_filtered, 'w') as out:
+            for line in f:
+                if line.startswith('#'):
+                    out.write(line)
+                    continue
+                arr = line.split('\t')
+                # No third column means there is no feature type to look at
+                # (blank line, non-tabular content): keep the line as it is.
+                if len(arr) < 3:
+                    out.write(line)
+                    continue
+                feature_type = arr[2].rstrip()
+                if feature_type == 'gene':
+                    continue
+                if feature_type in ('transcript', 'mRNA') and len(arr) > 8:
+                    # as we remove the gene features, we should also remove the Parent attribute (gene id) from the transcript
+                    arr[8] = ';'.join([item for item in arr[8].split(';') if 'Parent=' not in item]).rstrip()
+                    line = '\t'.join(arr) + '\n'
+                out.write(line)
+
+def gff3sort(inputFile, outputFile, precise=False):
+    """Run gff3sort.pl on ``inputFile``, sending its stdout to ``outputFile``.
+
+    :param inputFile: path of the GFF3 file handed to gff3sort.pl
+    :param outputFile: passed as ``stdout`` to _handleExceptionAndCheckCall
+        (presumably an open file object -- confirm against the helper)
+    :param precise: when true, ``--precise`` is appended to the command line
+    :return: whatever _handleExceptionAndCheckCall returns
+    """
+    command = ['gff3sort.pl', inputFile] + (['--precise'] if precise else [])
+    return _handleExceptionAndCheckCall(command, stdout=outputFile)
+
+def bedSort(inputFile, outputFile):
+    """Sort ``inputFile`` with the ``sort`` command, writing to ``outputFile``.
+
+    The sort keys are column 1, then column 2 compared numerically, then
+    column 6.
+
+    :param inputFile: path of the file to sort
+    :param outputFile: passed as ``stdout`` to _handleExceptionAndCheckCall
+        (presumably an open file object -- confirm against the helper)
+    :return: whatever _handleExceptionAndCheckCall returns
+    """
+    sort_keys = ['-k1,1', '-k2,2n', '-k6,6']
+    command = ['sort'] + sort_keys + [inputFile]
+    return _handleExceptionAndCheckCall(command, stdout=outputFile)
+
+def bgzip(inputFile):
+    """Compress ``inputFile`` with ``bgzip`` and return the compressed path.
+
+    :param inputFile: path of the file to compress
+    :return: path of the compressed file, ``inputFile + '.gz'``
+    :raises ValueError: if bgzip exits with a non-zero status, or if the
+        compressed file does not exist afterwards
+    """
+    returncode = subprocess.call(['bgzip', inputFile])
+    # Check the exit status as well as the file: a '.gz' left over from an
+    # earlier run would otherwise make a failed invocation look successful.
+    if returncode != 0:
+        raise ValueError('bgzip exited with status %s' % returncode)
+    filename = inputFile + '.gz'
+    if os.path.exists(filename):
+        return filename
+    else:
+        raise ValueError('Did not find gz file')
+
+def createTabix(inputFile, dataType):
+    """Build a tabix index for ``inputFile`` and return the index path.
+
+    :param inputFile: path of the file to index
+    :param dataType: value passed to tabix's ``-p`` (preset) option
+    :return: path of the index file, ``inputFile + '.tbi'``
+    :raises ValueError: if tabix exits with a non-zero status, or if the
+        index file does not exist afterwards
+    """
+    returncode = subprocess.call(['tabix', '-p', dataType, inputFile])
+    # Check the exit status as well as the file: an index left over from an
+    # earlier run would otherwise make a failed invocation look successful.
+    if returncode != 0:
+        raise ValueError('tabix exited with status %s' % returncode)
+    filename = inputFile + '.tbi'
+    if os.path.exists(filename):
+        return filename
+    else:
+        raise ValueError('Did not find tbi file')
+
+def generate_tabix_indexed_track(inputFile, dataType, trackName, outputFolder):
+    """Sort, compress and tabix-index a BED or GFF track into ``outputFolder``.
+
+    BED input is sorted with bedSort. GFF input first has its gene features
+    removed with remove_gene_lines and is then sorted with gff3sort; its
+    track name also gains a '.gff3.gz' suffix. The sorted file is compressed
+    with bgzip and indexed with createTabix, then the compressed track is
+    copied to ``outputFolder/trackName`` and the index to
+    ``outputFolder/trackName + '.tbi'``.
+
+    :param inputFile: path of the BED or GFF3 file
+    :param dataType: format name; must contain "bed" or "gff"
+    :param trackName: file name of the track inside outputFolder
+    :param outputFolder: directory receiving the track and its index
+    :raises ValueError: if dataType names neither BED nor GFF, or if
+        compression or indexing fails
+    """
+    if "bed" in dataType:
+        fileType = 'bed'
+    elif "gff" in dataType:
+        fileType = 'gff'
+    else:
+        # Used to fall through to an UnboundLocalError further down.
+        raise ValueError('Unsupported data type for tabix track: %s' % dataType)
+    # delete=False: bgzip is expected to replace the sorted file with
+    # '<name>.gz', so an auto-deleting temporary file would have nothing left
+    # to delete. All scratch paths are tracked and removed explicitly below.
+    sortedFile = tempfile.NamedTemporaryFile(delete=False)
+    scratchPaths = [sortedFile.name, sortedFile.name + '.gz', sortedFile.name + '.gz.tbi']
+    try:
+        try:
+            if fileType == 'bed':
+                bedSort(inputFile, sortedFile)
+            else:
+                filteredFile = tempfile.NamedTemporaryFile(delete=False)
+                filteredFile.close()
+                scratchPaths.append(filteredFile.name)
+                remove_gene_lines(inputFile, filteredFile.name)
+                gff3sort(filteredFile.name, sortedFile)
+                # add .gff3.gz extension to Tabix GFF3 files, in order to enable creating name index with generate-names.pl
+                trackName = trackName + '.gff3.gz'
+        finally:
+            # Release our handle before bgzip reads the file by name.
+            sortedFile.close()
+        compressedFile = bgzip(sortedFile.name)
+        tabixFile = createTabix(compressedFile, fileType)
+        trackPath = os.path.join(outputFolder, trackName)
+        trackIndexPath = os.path.join(outputFolder, trackName+'.tbi')
+        shutil.copy(compressedFile, trackPath)
+        shutil.copy(tabixFile, trackIndexPath)
+    finally:
+        # The track and index have been copied (or the run failed): either
+        # way nothing in the temporary directory is needed any more.
+        for path in scratchPaths:
+            if os.path.exists(path):
+                os.remove(path)
+
 def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=True):
     if "bed" in dataType:
         fileType = "--bed"
@@ -297,13 +375,15 @@
     p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout)
     return p
 
-def prepare_refseqs(fasta_file_name, outputFolder):
-    array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder]
+def prepare_refseqs(fastaFile, outputFolder):
+    """Index ``fastaFile`` and run prepare-refseqs.pl on it.
+
+    createFastaIndex is called first, then prepare-refseqs.pl is invoked
+    with ``--indexed_fasta fastaFile --out outputFolder``.
+
+    :param fastaFile: path of the reference FASTA file
+    :param outputFolder: directory passed to the script's ``--out`` option
+    :return: whatever _handleExceptionAndCheckCall returns for the call
+    """
+    # Superseded invocation using --fasta, kept for reference:
+    #array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder]
+    # The index is built before the script runs, since the FASTA is passed
+    # to it as --indexed_fasta.
+    createFastaIndex(fastaFile)
+    array_call = ['prepare-refseqs.pl', '--indexed_fasta', fastaFile, '--out', outputFolder]
     p = _handleExceptionAndCheckCall(array_call)
     return p       
 
-def generate_names(outputFolder):
-    array_call = ['generate-names.pl', '-v', '--out', outputFolder]
+def generate_names(outputFolder, hashBits=4):
+    """Run generate-names.pl in verbose mode on ``outputFolder``.
+
+    :param outputFolder: directory passed to the script's ``--out`` option
+    :param hashBits: value passed to the script's ``--hashBits`` option
+    :return: whatever _handleExceptionAndCheckCall returns for the call
+    """
+    # Pass the caller's hashBits through; it used to be ignored in favour of
+    # a hard-coded '4'. str() because command-line arguments must be strings.
+    array_call = ['generate-names.pl', '--hashBits', str(hashBits), '-v', '--out', outputFolder]
     p = _handleExceptionAndCheckCall(array_call)
     return p