comparison htsc_create_sliding_windows.py @ 0:94a987a7da69 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/htseq-clip commit 4879439f0df3386b97d8507c5991051fbdda053a
author bgruening
date Tue, 11 Oct 2022 16:09:23 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:94a987a7da69
1 #!/usr/bin/env python3
2
3 import argparse
4 import os
5 import subprocess
6
7 """
8
9 Install deseq-clip
10 ==================
11
12 conda install -c bioconda pysam
13 conda install -c bioconda htseq
14 pip install htseq-clip
15
16 Or directly by:
17 conda install -c bioconda htseq-clip
18
19
20 Test call
21 =========
22
23 python htsc_create_sliding_windows.py --gff test-data/paper_tus.Synechocystis_pSYSM.gff3 --out test_compare_out --hcw-w 50 --hcw-s 20 --no-zipper
24
25 Compare:
26 diff test_compare_out/windows_mapped_to_ids.txt test-data/windows.exp.txt
27 diff test_compare_out/windows.bed test-data/windows.exp.bed
28 diff test_compare_out/annotation.bed test-data/annotation.exp.bed
29
30 This corresponds to:
31 htseq-clip annotation -g test-data/paper_tus.Synechocystis_pSYSM.gff3 -o test-data/annotation.exp.bed
32 htseq-clip createSlidingWindows -i test-data/annotation.exp.bed -w 50 -s 20 -o test-data/windows.exp.bed
33 htseq-clip mapToId -a test-data/windows.exp.bed -o test-data/windows.exp.txt
34
35 More tests:
36 python htsc_create_sliding_windows.py --gff test-data/paper_tus.Synechocystis_pSYSM.gff3 --out test_compare_out --hcw-w 50 --hcw-s 20 --no-zipper --hca-unsorted
37
38 DEWSeq input files:
39 test_compare_out/windows_mapped_to_ids.txt
40
41 """
42
43
44 ################################################################################
45
46 def setup_argument_parser():
47 """Setup argparse parser."""
48 help_description = """
49 Based on genomic annotations GFF file (--gff), create sliding window
50 annotations with htseq-clip.
51
52 """
53 # Define argument parser.
54 p = argparse.ArgumentParser(add_help=False,
55 prog="htsc_create_sliding_windows.py",
56 description=help_description,
57 formatter_class=argparse.MetavarTypeHelpFormatter)
58
59 # Required arguments.
60 p.add_argument("-h", "--help",
61 action="help",
62 help="Print help message")
63 p.add_argument("--gff",
64 dest="in_gff",
65 type=str,
66 metavar='str',
67 required=True,
68 help="Annotation file GFF (so far tested with hg38 GENCODE format). Also accepts gff.gz as well")
69 p.add_argument("--out",
70 dest="out_folder",
71 type=str,
72 metavar='str',
73 required=True,
74 help="Results output folder")
75 # htseq-clip annotation.
76 p.add_argument("--hca-unsorted",
77 dest="hca_unsorted",
78 default=False,
79 action="store_true",
80 help="htseq-clip annotation --unsorted parameter. Use this flag if the GFF file is unsorted (default: False)")
81 # htseq-clip createSlidingWindows.
82 p.add_argument("--hcw-w",
83 dest="hcw_w",
84 type=int,
85 metavar='int',
86 default=50,
87 help="htseq-clip createSlidingWindows -w parameter. Sliding window size. If unsure, try 75-100 (default: 50)")
88 p.add_argument("--hcw-s",
89 dest="hcw_s",
90 type=int,
91 metavar='int',
92 default=20,
93 help="htseq-clip createSlidingWindows -s parameter. Step size for sliding window (default: 20)")
94 # More.
95 p.add_argument("--no-zipper",
96 dest="no_zipper",
97 default=False,
98 action="store_true",
99 help="Do not gzip output files (default: False)")
100 return p
101
102
103 ################################################################################
104
105 if __name__ == '__main__':
106
107 parser = setup_argument_parser()
108 args = parser.parse_args()
109
110 assert os.path.exists(args.in_gff), "--gff file \"%s\" not found" % (args.in_gff)
111
112 # Output folder.
113 if not os.path.exists(args.out_folder):
114 os.makedirs(args.out_folder)
115
116 """
117 1) Flatten annotation.
118 htseq-clip annotation -g args.in_gff -o annotation.bed
119
120 -o content example:
121 Synechocystis 5 105 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001 2 -
122 Synechocystis 576 990 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001 2 +
123 Synechocystis 809 909 TU3@TU3@protein_coding@exon@1/1@TU3:exon0001 2 -
124 Synechocystis 1531 2150 TU4@TU4@protein_coding@exon@1/1@TU4:exon0001 2 +
125 Synechocystis 2150 2701 TU6@TU6@protein_coding@exon@1/1@TU6:exon0001 2 +
126 ...
127
128 """
129
130 annot_bed = args.out_folder + "/annotation.bed.gz"
131 if args.no_zipper:
132 annot_bed = args.out_folder + "/annotation.bed"
133
134 print("Convert --gff to BED ... ")
135 check_cmd = "htseq-clip annotation -g " + args.in_gff + " -o " + annot_bed
136 if args.hca_unsorted:
137 check_cmd += " --unsorted"
138 output = subprocess.getoutput(check_cmd)
139
140 print(check_cmd)
141 print(output)
142
143 assert os.path.exists(annot_bed), "htseq-clip annotation -o file \"%s\" not found" % (annot_bed)
144
145 """
146 2) Create sliding windows.
147 htseq-clip createSlidingWindows -i annotation.bed -w hcw_w -s hcw_s -o windows.bed.gz
148
149 -o content example:
150 Synechocystis 5 80 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00001@1 2 -
151 Synechocystis 15 90 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00002@2 2 -
152 Synechocystis 25 100 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00003@3 2 -
153 Synechocystis 35 105 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00004@4 2 -
154 Synechocystis 576 651 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00001@1 2 +
155 Synechocystis 586 661 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00002@2 2 +
156 Synechocystis 596 671 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00003@3 2 +
157 Synechocystis 606 681 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00004@4 2 +
158 Synechocystis 616 691 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00005@5 2 +
159 ...
160
161 """
162
163 win_bed = args.out_folder + "/windows.bed.gz"
164 if args.no_zipper:
165 win_bed = args.out_folder + "/windows.bed"
166
167 print("Create sliding windows BED ... ")
168 win_params = " -w %i -s %i " % (args.hcw_w, args.hcw_s)
169 check_cmd = "htseq-clip createSlidingWindows -i " + annot_bed + win_params + " -o " + win_bed
170 output = subprocess.getoutput(check_cmd)
171
172 print(check_cmd)
173 print(output)
174
175 assert os.path.exists(annot_bed), "htseq-clip createSlidingWindows -o file \"%s\" not found" % (win_bed)
176
177 """
178 3) Create mapping file for DEWSeq.
179 mapToId: extract "name" column from the annotation file and map the entries
180 to unique id and print out in tab separated format.
181 htseq-clip mapToId -a windows.bed.gz -o windows.txt.gz
182
183 -o content example:
184 unique_id chromosome begin end strand gene_id gene_name gene_type gene_region Nr_of_region Total_nr_of_region window_number
185 TU1:exon0001W00001 Synechocystis 5 80 - TU1 TU1 protein_coding exon 1 1 1
186 TU1:exon0001W00002 Synechocystis 15 90 - TU1 TU1 protein_coding exon 1 1 2
187 TU1:exon0001W00003 Synechocystis 25 100 - TU1 TU1 protein_coding exon 1 1 3
188 TU1:exon0001W00004 Synechocystis 35 105 - TU1 TU1 protein_coding exon 1 1 4
189 TU2:exon0001W00001 Synechocystis 576 651 + TU2 TU2 protein_coding exon 1 1 1
190 TU2:exon0001W00002 Synechocystis 586 661 + TU2 TU2 protein_coding exon 1 1 2
191 TU2:exon0001W00003 Synechocystis 596 671 + TU2 TU2 protein_coding exon 1 1 3
192 TU2:exon0001W00004 Synechocystis 606 681 + TU2 TU2 protein_coding exon 1 1 4
193 TU2:exon0001W00005 Synechocystis 616 691 + TU2 TU2 protein_coding exon 1 1 5
194 ...
195
196 """
197
198 mapped2ids_txt = args.out_folder + "/windows_mapped_to_ids.txt.gz"
199 if args.no_zipper:
200 mapped2ids_txt = args.out_folder + "/windows_mapped_to_ids.txt"
201
202 print("Create DEWSeq input annotation file ... ")
203 win_params = " -w %i -s %i " % (args.hcw_w, args.hcw_s)
204 check_cmd = "htseq-clip mapToId -a " + win_bed + " -o " + mapped2ids_txt
205 output = subprocess.getoutput(check_cmd)
206
207 print(check_cmd)
208 print(output)
209
210 assert os.path.exists(mapped2ids_txt), "htseq-clip mapToId -o file \"%s\" not found" % (mapped2ids_txt)
211
212 """
213 Report.
214
215 """
216
217 print("")
218 print("OUTPUT FILES")
219 print("============")
220 print("Annotation BED:\n%s" % (annot_bed))
221 print("Windows BED:\n%s" % (win_bed))
222 print("Windows mapped to IDs TXT (DEWseq annotation file):\n%s" % (mapped2ids_txt))
223 print("")