comparison RM_custom_search.py @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children 16150c85fb3a
comparison
equal deleted inserted replaced
-1:000000000000 0:a4cd8608ef6b
1 #!/usr/bin/env python3
2 ''' RepeatMasker search against custom database
3 input:
4 - archive with sequencing data
5 - custom repeat database
6 '''
7 import zipfile
8 import shutil
9 import os
10 import subprocess
11 from parallel import parallel2
12 import glob
13
14 print(os.environ)
15 def extract_sequences(f):
16 # check archive:
17 try:
18 z=zipfile.ZipFile(f)
19 # extract only dirCLXXXX/reads.fas
20 seq_list = []
21 for filein in z.namelist():
22 if filein.lower().startswith("seqclust/clustering/clusters/dir_cl") and filein.endswith("reads.fas"):
23 outdir = filein.split("/")[3]
24 outfile = outdir +"/reads.fas"
25 source = z.open(filein)
26 os.mkdir(outdir)
27 target = open(outfile, "wb")
28 shutil.copyfileobj(source, target)
29 seq_list.append(outfile)
30 if len(seq_list) == 0:
31 raise ValueError()
32
33 except zipfile.BadZipfile as e:
34 print("unable to extract sequences from archive!")
35 raise e
36
37 except IOError as e:
38 print("unable to extract sequences from archive!")
39 raise e
40
41 except ValueError as e:
42 print("No sequences found in archive!")
43 raise e
44
45 seq_list.sort()
46 return seq_list
47
48
49 def RepeatMasker(reads,database):
50 args = ["RepeatMasker", reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)]
51 print(args)
52 status=subprocess.call(args , stderr = open(os.devnull, 'wb'))
53 return status
54
55 def summarizeRepeatMaskerOutput(htmlout = "summary.html"):
56 cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r"
57 args = [ cmd, "-f", "dir_CL*/reads.fas", "-r", "dir_CL*/reads.fas.out", "-o", "RM-custom_output_table" ]
58 status=subprocess.call(args)
59 cmd = cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R"
60 args = [cmd, htmlout]
61 status=subprocess.call(args)
62 return status
63
64
65 def main():
66 from optparse import OptionParser
67
68 parser = OptionParser()
69 parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive")
70 parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database")
71 parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory")
72 parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html')
73
74 options, args = parser.parse_args()
75 config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config"
76
77
78 seq_files = extract_sequences(options.input_file) ### REMOVE - TESTING
79 parallel2(RepeatMasker, seq_files, [options.database])
80
81 status = summarizeRepeatMaskerOutput(options.report)
82
83
84
85 if __name__== "__main__":
86 main()
87
88