Mercurial > repos > petr-novak > re_utils
comparison RM_custom_search.py.bak @ 0:a4cd8608ef6b draft
Uploaded
author | petr-novak |
---|---|
date | Mon, 01 Apr 2019 07:56:36 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a4cd8608ef6b |
---|---|
1 #!/usr/bin/env python | |
2 ''' RepeatMasker search against custom database | |
3 input: | |
4 - archive with sequencing data | |
5 - custom repeat database | |
6 ''' | |
7 import zipfile | |
8 import shutil | |
9 import os | |
10 import subprocess | |
11 import parallel | |
12 import glob | |
13 | |
14 def extract_sequences(f): | |
15 # check archive: | |
16 try: | |
17 z=zipfile.ZipFile(f) | |
18 # extract only dirCLXXXX/reads.fas | |
19 seq_list = [] | |
20 for filein in z.namelist(): | |
21 if filein.lower().startswith("seqclust/clustering/clusters/dir_cl") and filein.endswith("reads.fas"): | |
22 outdir = filein.split("/")[3] | |
23 outfile = outdir +"/reads.fas" | |
24 source = z.open(filein) | |
25 os.mkdir(outdir) | |
26 target = file(outfile, "wb") | |
27 shutil.copyfileobj(source, target) | |
28 seq_list.append(outfile) | |
29 if len(seq_list) == 0: | |
30 raise ValueError() | |
31 | |
32 except zipfile.BadZipfile as e: | |
33 print "unable to extract sequences from archive!" | |
34 raise e | |
35 | |
36 except IOError as e: | |
37 print "unable to extract sequences from archive!" | |
38 raise e | |
39 | |
40 except ValueError as e: | |
41 print "No sequences found in archive!" | |
42 raise e | |
43 | |
44 seq_list.sort() | |
45 return seq_list | |
46 | |
47 def get_RM_dir(config_file,galaxy_dir): | |
48 shutil.copy(config_file,"seqclust.config") | |
49 f = open("seqclust.config",'a') | |
50 f.write("\necho $REPEAT_MASKER") | |
51 f.close() | |
52 args = ["bash", "seqclust.config"] | |
53 p = subprocess.Popen(args,stdout = subprocess.PIPE) | |
54 RMdir = "{0}{1}".format(galaxy_dir,p.stdout.readline().strip()) | |
55 return RMdir | |
56 | |
57 def RepeatMasker(RM,reads,database): | |
58 args = [RM, reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)] | |
59 status=subprocess.call(args , stderr = open(os.devnull, 'wb')) | |
60 return status | |
61 | |
62 def summarizeRepeatMaskerOutput(htmlout = "summary.html"): | |
63 cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r" | |
64 args = [ cmd, "-f", "dir_CL*/reads.fas", "-r", "dir_CL*/reads.fas.out", "-o", "RM-custom_output_table" ] | |
65 status=subprocess.call(args) | |
66 cmd = cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R" | |
67 args = [cmd, htmlout] | |
68 status=subprocess.call(args) | |
69 return status | |
70 | |
71 | |
72 def main(): | |
73 from optparse import OptionParser | |
74 | |
75 parser = OptionParser() | |
76 parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive") | |
77 parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database") | |
78 parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory") | |
79 parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html') | |
80 | |
81 options, args = parser.parse_args() | |
82 config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config" | |
83 | |
84 | |
85 seq_files = extract_sequences(options.input_file) ### REMOVE - TESTING | |
86 RMdir = get_RM_dir(config_file, options.galaxy_dir) | |
87 parallel.parallel(RepeatMasker, [RMdir+"/RepeatMasker"], seq_files, [options.database]) | |
88 | |
89 status = summarizeRepeatMaskerOutput(options.report) | |
90 | |
91 | |
92 | |
93 if __name__== "__main__": | |
94 main() | |
95 | |
96 |