diff RM_custom_search.py.bak @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RM_custom_search.py.bak	Mon Apr 01 07:56:36 2019 -0400
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+''' RepeatMasker search against custom database
+input:
+- archive with sequencing data
+- custom repeat database
+'''
+import zipfile
+import shutil
+import os
+import subprocess
+import parallel
+import glob
+
+def extract_sequences(f):
+    # check archive:
+    try:
+        z=zipfile.ZipFile(f)
+        # extract only dirCLXXXX/reads.fas
+        seq_list = []
+        for filein in z.namelist():
+            if filein.lower().startswith("seqclust/clustering/clusters/dir_cl") and filein.endswith("reads.fas"):
+                outdir = filein.split("/")[3]
+                outfile = outdir +"/reads.fas"
+                source = z.open(filein)
+                os.mkdir(outdir)
+                target = file(outfile, "wb")
+                shutil.copyfileobj(source, target)
+                seq_list.append(outfile)
+        if  len(seq_list) == 0:
+            raise ValueError()
+                
+    except zipfile.BadZipfile as e:
+        print "unable to extract sequences from archive!"
+        raise e
+    
+    except IOError as e:
+        print "unable to extract sequences from archive!"
+        raise e
+
+    except ValueError as e:
+        print "No sequences found in archive!"
+        raise e
+    
+    seq_list.sort()
+    return seq_list
+
+def get_RM_dir(config_file,galaxy_dir):
+    shutil.copy(config_file,"seqclust.config")
+    f = open("seqclust.config",'a')
+    f.write("\necho $REPEAT_MASKER")
+    f.close()
+    args = ["bash", "seqclust.config"]
+    p = subprocess.Popen(args,stdout = subprocess.PIPE)
+    RMdir = "{0}{1}".format(galaxy_dir,p.stdout.readline().strip())
+    return RMdir
+    
+def RepeatMasker(RM,reads,database):
+    args = [RM, reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)]
+    status=subprocess.call(args , stderr = open(os.devnull, 'wb'))
+    return status
+
+def summarizeRepeatMaskerOutput(htmlout = "summary.html"):
+    cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r"
+    args = [ cmd, "-f", "dir_CL*/reads.fas", "-r", "dir_CL*/reads.fas.out", "-o", "RM-custom_output_table"  ]
+    status=subprocess.call(args)
+    cmd = cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R"
+    args = [cmd, htmlout]
+    status=subprocess.call(args)
+    return status
+    
+
+def main():
+    from optparse import OptionParser
+    
+    parser = OptionParser()
+    parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive")
+    parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database")
+    parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory")
+    parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html')
+    
+    options, args = parser.parse_args()
+    config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config"
+    
+    
+    seq_files = extract_sequences(options.input_file)  ### REMOVE - TESTING
+    RMdir = get_RM_dir(config_file, options.galaxy_dir)
+    parallel.parallel(RepeatMasker, [RMdir+"/RepeatMasker"], seq_files, [options.database])
+    
+    status = summarizeRepeatMaskerOutput(options.report)
+    
+     
+        
+if __name__== "__main__":
+    main()
+  
+