Mercurial > repos > dereeper > pangenome_explorer
comparison Snakemake_files/Snakefile_wget_pggb_heatmap_upset_COG @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 import glob | |
2 import os | |
3 import shutil | |
4 | |
5 import yaml | |
6 configfile: "config.yaml" | |
7 | |
8 SAMPLES = [] | |
9 with open("config.yaml", "r") as yaml_file: | |
10 genome_data = yaml.safe_load(yaml_file) | |
11 if "ids" in genome_data.keys(): | |
12 for id in genome_data["ids"]: | |
13 SAMPLES.append(id) | |
14 if "input_genbanks" in genome_data.keys(): | |
15 for gb_path in genome_data["input_genbanks"]: | |
16 cmd = "grep 'ACCESSION' "+gb_path | |
17 returned_value = subprocess.getoutput(cmd) | |
18 words = returned_value.split() | |
19 name = words[1] | |
20 #words = returned_value.split() | |
21 #SAMPLES.append(words[1]) | |
22 words = gb_path.split("/") | |
23 genbank_file_name = words[-1] | |
24 x = genbank_file_name.replace(".", "_") | |
25 cmd = "sed -i 's/ACCESSION "+name+"/ACCESSION "+x+"/g' "+gb_path | |
26 subprocess.getoutput(cmd) | |
27 | |
28 cmd_locus = "grep 'LOCUS' "+gb_path | |
29 returned_value = subprocess.getoutput(cmd_locus) | |
30 words_locus = returned_value.split() | |
31 name_locus = words_locus[1] | |
32 cmd = "sed -i 's/LOCUS "+name_locus+"/LOCUS "+x+"/' "+gb_path | |
33 subprocess.getoutput(cmd) | |
34 | |
35 cmd = "grep 'ACCESSION' "+gb_path | |
36 returned_value = subprocess.getoutput(cmd) | |
37 words = returned_value.split() | |
38 SAMPLES.append(words[1]) | |
39 | |
40 | |
41 | |
42 rule final: | |
43 input: | |
44 "outputs/GCskew.txt", | |
45 "outputs/pggb_out/all_genomes.fa.smooth.final.gfa", | |
46 "outputs/pav_matrix.tsv", | |
47 "outputs/heatmap.svg.gz", | |
48 "outputs/cog_output.txt", | |
49 "outputs/rarefaction_curves.txt" | |
50 | |
51 | |
52 rule ncbi_datasets: | |
53 input: | |
54 "config.yaml" | |
55 output: | |
56 expand("outputs/genomes/{sample}.fasta", sample=SAMPLES), | |
57 expand("outputs/genomes/{sample}.gb", sample=SAMPLES), | |
58 expand("outputs/genomes/{sample}.prt", sample=SAMPLES), | |
59 expand("outputs/genomes/{sample}.nuc", sample=SAMPLES), | |
60 genomes="outputs/genomes/genomes.txt", | |
61 strains="outputs/genomes/strains.txt" | |
62 shell: | |
63 """ | |
64 perl $PANEX_PATH/Perl/get_data.pl {input} outputs/genomes | |
65 """ | |
66 | |
67 | |
68 rule gcskew: | |
69 input: | |
70 "outputs/genomes/{sample}.fasta" | |
71 output: | |
72 "outputs/genomes/{sample}.fasta.gcskew.txt" | |
73 shell: | |
74 """ | |
75 python3 $PANEX_PATH/SkewIT/src/gcskew.py -i {input} -o {input}.gcskew.txt -k 1000 -w 1000 | |
76 """ | |
77 | |
78 rule concat_gcskew: | |
79 input: | |
80 expand("outputs/genomes/{sample}.fasta.gcskew.txt", sample=SAMPLES) | |
81 output: | |
82 out2="outputs/GCskew.txt" | |
83 shell: | |
84 """ | |
85 cat {input} >>{output.out2} | |
86 """ | |
87 | |
88 rule genbank2gff3: | |
89 input: | |
90 "outputs/genomes/{sample}.gb" | |
91 output: | |
92 gff1="outputs/genomes/{sample}.gb.gff", | |
93 shell: | |
94 """ | |
95 perl $PANEX_PATH/Perl/bp_genbank2gff3.pl -o outputs/genomes {input} | |
96 """ | |
97 | |
98 rule pggb: | |
99 input: | |
100 expand("outputs/genomes/{sample}.fasta", sample=SAMPLES), | |
101 expand("outputs/genomes/{sample}.gb.gff", sample=SAMPLES), | |
102 output: | |
103 gfa="outputs/pggb_out/all_genomes.fa.smooth.final.gfa", | |
104 png1="outputs/pggb_out/all_genomes.fa.lay.draw.png", | |
105 png2="outputs/pggb_out/all_genomes.fa.og.viz_multiqc.png", | |
106 vcf="outputs/all_genomes.vcf", | |
107 shell: | |
108 """ | |
109 samtools faidx outputs/genomes/all_genomes.fa | |
110 reference=$(head -1 outputs/genomes/strains.txt | awk '{{print $2}}') | |
111 pggb -i outputs/genomes/all_genomes.fa -o outputs/pggb_out -V $reference -m -M | |
112 mv outputs/pggb_out/all_genomes.*smooth.final.gfa {output.gfa} | |
113 mv outputs/pggb_out/all_genomes.*lay.draw.png {output.png1} | |
114 mv outputs/pggb_out/all_genomes.fa.*.og.viz_multiqc.png {output.png2} | |
115 mv outputs/pggb_out/all_genomes.*vcf {output.vcf} | |
116 """ | |
117 | |
118 rule odgi: | |
119 input: | |
120 gfa="outputs/pggb_out/all_genomes.fa.smooth.final.gfa", | |
121 output: | |
122 og="outputs/pggb_out/all_genomes.fa.smooth.final.gfa.og", | |
123 distance="outputs/pggb_out/all_genomes.fa.smooth.final.gfa.og.distance", | |
124 shell: | |
125 """ | |
126 odgi build -g {input} -o {output.og} | |
127 odgi similarity -i {output.og} -d >{output.distance} | |
128 """ | |
129 | |
130 | |
131 rule create_gene_paths: | |
132 input: | |
133 gff="outputs/genomes/{sample}.gb.gff", | |
134 gfa="outputs/pggb_out/all_genomes.fa.smooth.final.gfa", | |
135 output: | |
136 basename="outputs/genomes/{sample}.gene_segments", | |
137 gene_length="outputs/genomes/{sample}.gene_segments.gene_length.txt", | |
138 bed="outputs/genomes/{sample}.gene_segments.bed", | |
139 shell: | |
140 """ | |
141 perl $PANEX_PATH/Perl/CreateGenePathsFromGFA.pl {input.gfa} {input.gff} {output.basename} | |
142 """ | |
143 | |
144 rule bedtools_intersect: | |
145 input: | |
146 strains="outputs/genomes/strains.txt", | |
147 bedfiles=expand("outputs/genomes/{sample}.gene_segments.bed", sample=SAMPLES) | |
148 params: | |
149 identity=30 | |
150 output: | |
151 "outputs/pav_matrix.tsv", | |
152 shell: | |
153 """ | |
154 perl $PANEX_PATH/Perl/GeneratePAVfromBed.pl {input.strains} outputs/genomes {output} {params.identity} | |
155 """ | |
156 | |
157 | |
158 rule cog: | |
159 input: | |
160 pav="outputs/pav_matrix.tsv" | |
161 output: | |
162 cog="outputs/cog_output.txt", | |
163 cogstat="outputs/cog_stats.txt", | |
164 cogstat2="outputs/cog_stats2.txt", | |
165 cogofclusters="outputs/cog_of_clusters.txt" | |
166 shell: | |
167 """ | |
168 perl $PANEX_PATH/Perl/GetCogOfCluster.pl {input} outputs/genomes {output.cog} {output.cogstat} {output.cogstat2} {output.cogofclusters} outputs/genomes/strains.txt | |
169 """ | |
170 | |
171 rule heatmap_upset: | |
172 input: | |
173 pav="outputs/pav_matrix.tsv" | |
174 output: | |
175 heatmap="outputs/heatmap.svg.gz", | |
176 html="outputs/heatmap.svg.heatmap_plotly.html", | |
177 upsetr="outputs/upsetr.svg", | |
178 binpav="outputs/heatmap.svg.pangenes_01matrix.txt" | |
179 shell: | |
180 """ | |
181 perl $PANEX_PATH/Perl/GenerateHeatmapFromPAV.pl {input.pav} outputs/heatmap.svg | |
182 mv outputs/heatmap.svg.upsetr.svg {output.upsetr} | |
183 """ | |
184 | |
185 rule micropan: | |
186 input: | |
187 binpav="outputs/heatmap.svg.pangenes_01matrix.txt" | |
188 output: | |
189 txt="outputs/rarefaction_curves.txt", | |
190 pdf="outputs/rarefaction_curves.pdf", | |
191 svg="outputs/rarefaction_curves.svg", | |
192 heaps="outputs/heaps.tsv" | |
193 shell: | |
194 """ | |
195 Rscript $PANEX_PATH/R/micropan_rarefaction.R -f {input.binpav} -p {output.pdf} -a {output.heaps} -o {output.txt} | |
196 pdf2svg {output.pdf} {output.svg} | |
197 """ |