Mercurial > repos > bgruening > htseq_clip
comparison htsc_create_sliding_windows.py @ 0:94a987a7da69 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/htseq-clip commit 4879439f0df3386b97d8507c5991051fbdda053a
author | bgruening |
---|---|
date | Tue, 11 Oct 2022 16:09:23 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:94a987a7da69 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import argparse | |
4 import os | |
5 import subprocess | |
6 | |
7 """ | |
8 | |
9 Install deseq-clip | |
10 ================== | |
11 | |
12 conda install -c bioconda pysam | |
13 conda install -c bioconda htseq | |
14 pip install htseq-clip | |
15 | |
16 Or directly by: | |
17 conda install -c bioconda htseq-clip | |
18 | |
19 | |
20 Test call | |
21 ========= | |
22 | |
23 python htsc_create_sliding_windows.py --gff test-data/paper_tus.Synechocystis_pSYSM.gff3 --out test_compare_out --hcw-w 50 --hcw-s 20 --no-zipper | |
24 | |
25 Compare: | |
26 diff test_compare_out/windows_mapped_to_ids.txt test-data/windows.exp.txt | |
27 diff test_compare_out/windows.bed test-data/windows.exp.bed | |
28 diff test_compare_out/annotation.bed test-data/annotation.exp.bed | |
29 | |
30 This corresponds to: | |
31 htseq-clip annotation -g test-data/paper_tus.Synechocystis_pSYSM.gff3 -o test-data/annotation.exp.bed | |
32 htseq-clip createSlidingWindows -i test-data/annotation.exp.bed -w 50 -s 20 -o test-data/windows.exp.bed | |
33 htseq-clip mapToId -a test-data/windows.exp.bed -o test-data/windows.exp.txt | |
34 | |
35 More tests: | |
36 python htsc_create_sliding_windows.py --gff test-data/paper_tus.Synechocystis_pSYSM.gff3 --out test_compare_out --hcw-w 50 --hcw-s 20 --no-zipper --hca-unsorted | |
37 | |
38 DEWSeq input files: | |
39 test_compare_out/windows_mapped_to_ids.txt | |
40 | |
41 """ | |
42 | |
43 | |
44 ################################################################################ | |
45 | |
46 def setup_argument_parser(): | |
47 """Setup argparse parser.""" | |
48 help_description = """ | |
49 Based on genomic annotations GFF file (--gff), create sliding window | |
50 annotations with htseq-clip. | |
51 | |
52 """ | |
53 # Define argument parser. | |
54 p = argparse.ArgumentParser(add_help=False, | |
55 prog="htsc_create_sliding_windows.py", | |
56 description=help_description, | |
57 formatter_class=argparse.MetavarTypeHelpFormatter) | |
58 | |
59 # Required arguments. | |
60 p.add_argument("-h", "--help", | |
61 action="help", | |
62 help="Print help message") | |
63 p.add_argument("--gff", | |
64 dest="in_gff", | |
65 type=str, | |
66 metavar='str', | |
67 required=True, | |
68 help="Annotation file GFF (so far tested with hg38 GENCODE format). Also accepts gff.gz as well") | |
69 p.add_argument("--out", | |
70 dest="out_folder", | |
71 type=str, | |
72 metavar='str', | |
73 required=True, | |
74 help="Results output folder") | |
75 # htseq-clip annotation. | |
76 p.add_argument("--hca-unsorted", | |
77 dest="hca_unsorted", | |
78 default=False, | |
79 action="store_true", | |
80 help="htseq-clip annotation --unsorted parameter. Use this flag if the GFF file is unsorted (default: False)") | |
81 # htseq-clip createSlidingWindows. | |
82 p.add_argument("--hcw-w", | |
83 dest="hcw_w", | |
84 type=int, | |
85 metavar='int', | |
86 default=50, | |
87 help="htseq-clip createSlidingWindows -w parameter. Sliding window size. If unsure, try 75-100 (default: 50)") | |
88 p.add_argument("--hcw-s", | |
89 dest="hcw_s", | |
90 type=int, | |
91 metavar='int', | |
92 default=20, | |
93 help="htseq-clip createSlidingWindows -s parameter. Step size for sliding window (default: 20)") | |
94 # More. | |
95 p.add_argument("--no-zipper", | |
96 dest="no_zipper", | |
97 default=False, | |
98 action="store_true", | |
99 help="Do not gzip output files (default: False)") | |
100 return p | |
101 | |
102 | |
103 ################################################################################ | |
104 | |
105 if __name__ == '__main__': | |
106 | |
107 parser = setup_argument_parser() | |
108 args = parser.parse_args() | |
109 | |
110 assert os.path.exists(args.in_gff), "--gff file \"%s\" not found" % (args.in_gff) | |
111 | |
112 # Output folder. | |
113 if not os.path.exists(args.out_folder): | |
114 os.makedirs(args.out_folder) | |
115 | |
116 """ | |
117 1) Flatten annotation. | |
118 htseq-clip annotation -g args.in_gff -o annotation.bed | |
119 | |
120 -o content example: | |
121 Synechocystis 5 105 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001 2 - | |
122 Synechocystis 576 990 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001 2 + | |
123 Synechocystis 809 909 TU3@TU3@protein_coding@exon@1/1@TU3:exon0001 2 - | |
124 Synechocystis 1531 2150 TU4@TU4@protein_coding@exon@1/1@TU4:exon0001 2 + | |
125 Synechocystis 2150 2701 TU6@TU6@protein_coding@exon@1/1@TU6:exon0001 2 + | |
126 ... | |
127 | |
128 """ | |
129 | |
130 annot_bed = args.out_folder + "/annotation.bed.gz" | |
131 if args.no_zipper: | |
132 annot_bed = args.out_folder + "/annotation.bed" | |
133 | |
134 print("Convert --gff to BED ... ") | |
135 check_cmd = "htseq-clip annotation -g " + args.in_gff + " -o " + annot_bed | |
136 if args.hca_unsorted: | |
137 check_cmd += " --unsorted" | |
138 output = subprocess.getoutput(check_cmd) | |
139 | |
140 print(check_cmd) | |
141 print(output) | |
142 | |
143 assert os.path.exists(annot_bed), "htseq-clip annotation -o file \"%s\" not found" % (annot_bed) | |
144 | |
145 """ | |
146 2) Create sliding windows. | |
147 htseq-clip createSlidingWindows -i annotation.bed -w hcw_w -s hcw_s -o windows.bed.gz | |
148 | |
149 -o content example: | |
150 Synechocystis 5 80 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00001@1 2 - | |
151 Synechocystis 15 90 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00002@2 2 - | |
152 Synechocystis 25 100 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00003@3 2 - | |
153 Synechocystis 35 105 TU1@TU1@protein_coding@exon@1/1@TU1:exon0001W00004@4 2 - | |
154 Synechocystis 576 651 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00001@1 2 + | |
155 Synechocystis 586 661 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00002@2 2 + | |
156 Synechocystis 596 671 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00003@3 2 + | |
157 Synechocystis 606 681 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00004@4 2 + | |
158 Synechocystis 616 691 TU2@TU2@protein_coding@exon@1/1@TU2:exon0001W00005@5 2 + | |
159 ... | |
160 | |
161 """ | |
162 | |
163 win_bed = args.out_folder + "/windows.bed.gz" | |
164 if args.no_zipper: | |
165 win_bed = args.out_folder + "/windows.bed" | |
166 | |
167 print("Create sliding windows BED ... ") | |
168 win_params = " -w %i -s %i " % (args.hcw_w, args.hcw_s) | |
169 check_cmd = "htseq-clip createSlidingWindows -i " + annot_bed + win_params + " -o " + win_bed | |
170 output = subprocess.getoutput(check_cmd) | |
171 | |
172 print(check_cmd) | |
173 print(output) | |
174 | |
175 assert os.path.exists(annot_bed), "htseq-clip createSlidingWindows -o file \"%s\" not found" % (win_bed) | |
176 | |
177 """ | |
178 3) Create mapping file for DEWSeq. | |
179 mapToId: extract "name" column from the annotation file and map the entries | |
180 to unique id and print out in tab separated format. | |
181 htseq-clip mapToId -a windows.bed.gz -o windows.txt.gz | |
182 | |
183 -o content example: | |
184 unique_id chromosome begin end strand gene_id gene_name gene_type gene_region Nr_of_region Total_nr_of_region window_number | |
185 TU1:exon0001W00001 Synechocystis 5 80 - TU1 TU1 protein_coding exon 1 1 1 | |
186 TU1:exon0001W00002 Synechocystis 15 90 - TU1 TU1 protein_coding exon 1 1 2 | |
187 TU1:exon0001W00003 Synechocystis 25 100 - TU1 TU1 protein_coding exon 1 1 3 | |
188 TU1:exon0001W00004 Synechocystis 35 105 - TU1 TU1 protein_coding exon 1 1 4 | |
189 TU2:exon0001W00001 Synechocystis 576 651 + TU2 TU2 protein_coding exon 1 1 1 | |
190 TU2:exon0001W00002 Synechocystis 586 661 + TU2 TU2 protein_coding exon 1 1 2 | |
191 TU2:exon0001W00003 Synechocystis 596 671 + TU2 TU2 protein_coding exon 1 1 3 | |
192 TU2:exon0001W00004 Synechocystis 606 681 + TU2 TU2 protein_coding exon 1 1 4 | |
193 TU2:exon0001W00005 Synechocystis 616 691 + TU2 TU2 protein_coding exon 1 1 5 | |
194 ... | |
195 | |
196 """ | |
197 | |
198 mapped2ids_txt = args.out_folder + "/windows_mapped_to_ids.txt.gz" | |
199 if args.no_zipper: | |
200 mapped2ids_txt = args.out_folder + "/windows_mapped_to_ids.txt" | |
201 | |
202 print("Create DEWSeq input annotation file ... ") | |
203 win_params = " -w %i -s %i " % (args.hcw_w, args.hcw_s) | |
204 check_cmd = "htseq-clip mapToId -a " + win_bed + " -o " + mapped2ids_txt | |
205 output = subprocess.getoutput(check_cmd) | |
206 | |
207 print(check_cmd) | |
208 print(output) | |
209 | |
210 assert os.path.exists(mapped2ids_txt), "htseq-clip mapToId -o file \"%s\" not found" % (mapped2ids_txt) | |
211 | |
212 """ | |
213 Report. | |
214 | |
215 """ | |
216 | |
217 print("") | |
218 print("OUTPUT FILES") | |
219 print("============") | |
220 print("Annotation BED:\n%s" % (annot_bed)) | |
221 print("Windows BED:\n%s" % (win_bed)) | |
222 print("Windows mapped to IDs TXT (DEWseq annotation file):\n%s" % (mapped2ids_txt)) | |
223 print("") |