Mercurial > repos > petr-novak > re_utils
view RM_custom_search.py.bak @ 0:a4cd8608ef6b draft
Uploaded
author | petr-novak |
---|---|
date | Mon, 01 Apr 2019 07:56:36 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python ''' RepeatMasker search against custom database input: - archive with sequencing data - custom repeat database ''' import zipfile import shutil import os import subprocess import parallel import glob def extract_sequences(f): # check archive: try: z=zipfile.ZipFile(f) # extract only dirCLXXXX/reads.fas seq_list = [] for filein in z.namelist(): if filein.lower().startswith("seqclust/clustering/clusters/dir_cl") and filein.endswith("reads.fas"): outdir = filein.split("/")[3] outfile = outdir +"/reads.fas" source = z.open(filein) os.mkdir(outdir) target = file(outfile, "wb") shutil.copyfileobj(source, target) seq_list.append(outfile) if len(seq_list) == 0: raise ValueError() except zipfile.BadZipfile as e: print "unable to extract sequences from archive!" raise e except IOError as e: print "unable to extract sequences from archive!" raise e except ValueError as e: print "No sequences found in archive!" raise e seq_list.sort() return seq_list def get_RM_dir(config_file,galaxy_dir): shutil.copy(config_file,"seqclust.config") f = open("seqclust.config",'a') f.write("\necho $REPEAT_MASKER") f.close() args = ["bash", "seqclust.config"] p = subprocess.Popen(args,stdout = subprocess.PIPE) RMdir = "{0}{1}".format(galaxy_dir,p.stdout.readline().strip()) return RMdir def RepeatMasker(RM,reads,database): args = [RM, reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)] status=subprocess.call(args , stderr = open(os.devnull, 'wb')) return status def summarizeRepeatMaskerOutput(htmlout = "summary.html"): cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r" args = [ cmd, "-f", "dir_CL*/reads.fas", "-r", "dir_CL*/reads.fas.out", "-o", "RM-custom_output_table" ] status=subprocess.call(args) cmd = cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R" args = [cmd, htmlout] status=subprocess.call(args) return status def main(): from optparse import OptionParser parser = OptionParser() parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive") parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database") parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory") parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html') options, args = parser.parse_args() config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config" seq_files = extract_sequences(options.input_file) ### REMOVE - TESTING RMdir = get_RM_dir(config_file, options.galaxy_dir) parallel.parallel(RepeatMasker, [RMdir+"/RepeatMasker"], seq_files, [options.database]) status = summarizeRepeatMaskerOutput(options.report) if __name__== "__main__": main()