3
|
1 import glob
|
|
2 import os
|
|
3 import shutil
|
|
4
|
|
5 import yaml
|
|
6 configfile: "config.yaml"
|
|
7
|
|
8 SAMPLES = []
|
|
9 with open("config.yaml", "r") as yaml_file:
|
|
10 genome_data = yaml.safe_load(yaml_file)
|
|
11 for genome_name, genome_info in genome_data["input_genomes"].items():
|
|
12 name = genome_info["name"]
|
|
13 SAMPLES.append(name)
|
|
14
|
|
15
|
|
16
|
|
17 rule final:
|
|
18 input:
|
|
19 expand("outputs/genomes/{sample}.fasta", sample=SAMPLES),
|
|
20 expand("outputs/genomes/{sample}.gff", sample=SAMPLES),
|
|
21 expand("outputs/genomes/{sample}.pep", sample=SAMPLES),
|
|
22 "outputs/pav_matrix.tsv",
|
|
23 "outputs/heatmap.svg.gz",
|
|
24 "outputs/rarefaction_curves.txt"
|
|
25
|
|
26 rule get_data:
|
|
27 input:
|
|
28 "config.yaml"
|
|
29 output:
|
|
30 "outputs/genomes/strains.txt",
|
|
31 expand("outputs/genomes/{sample}.fasta", sample=SAMPLES),
|
|
32 expand("outputs/genomes/{sample}.gff", sample=SAMPLES),
|
|
33 run:
|
|
34 with open(input[0], "r") as yaml_file, open(output[0], "w") as output_file:
|
|
35 genome_data = yaml.safe_load(yaml_file)
|
|
36 for genome_name, genome_info in genome_data["input_genomes"].items():
|
|
37 fasta_file = genome_info["fasta"]
|
|
38 name = genome_info["name"]
|
|
39 gff_file = genome_info["gff3"]
|
|
40 shutil.copy(gff_file, "outputs/genomes/"+str(name)+".gff")
|
|
41 shutil.copy(fasta_file, "outputs/genomes/"+str(name)+".fasta")
|
|
42 output_file.write(f"{name}\t{name}\n")
|
|
43
|
|
44 rule gffread:
|
|
45 input:
|
|
46 fasta = "outputs/genomes/{sample}.fasta",
|
|
47 gff = "outputs/genomes/{sample}.gff"
|
|
48 output:
|
|
49 "outputs/genomes/{sample}.pep"
|
|
50 shell:
|
|
51 """
|
|
52 gffread -y {output} -g {input.fasta} {input.gff}
|
|
53 """
|
|
54
|
|
55 rule orthofinder:
|
|
56 input:
|
|
57 expand("outputs/genomes/{sample}.pep", sample=SAMPLES)
|
|
58 params:
|
|
59 identity=80
|
|
60 output:
|
|
61 pav="outputs/Orthogroups.tsv"
|
|
62 shell:
|
|
63 """
|
|
64 mkdir -p outputs/proteomes
|
|
65 sed -i '/^>/! s/\./N/g' outputs/genomes/*.pep
|
|
66 cp -rf outputs/genomes/*.pep outputs/proteomes
|
|
67 orthofinder -f outputs/proteomes
|
|
68 mv outputs/proteomes/OrthoFinder/R*/Orthogroups/Orthogroups.tsv outputs/Orthogroups.tsv
|
|
69 """
|
|
70
|
|
71 rule convert_matrix:
|
|
72 input:
|
|
73 pav="outputs/Orthogroups.tsv"
|
|
74 output:
|
|
75 "outputs/pav_matrix.tsv"
|
|
76 shell:
|
|
77 """
|
|
78 perl $PANEX_PATH/Perl/ConvertOrthofinderMatrix.pl outputs/genomes {input} {output} outputs/genomes/strains.txt
|
|
79 """
|
|
80
|
|
81 rule heatmap_upset:
|
|
82 input:
|
|
83 pav="outputs/pav_matrix.tsv"
|
|
84 output:
|
|
85 heatmap="outputs/heatmap.svg.gz",
|
|
86 html="outputs/heatmap.svg.heatmap_plotly.html",
|
|
87 upsetr="outputs/upsetr.svg",
|
|
88 binpav="outputs/heatmap.svg.pangenes_01matrix.txt"
|
|
89 shell:
|
|
90 """
|
|
91 perl $PANEX_PATH/Perl/GenerateHeatmapFromPAV.pl {input.pav} outputs/heatmap.svg
|
|
92 mv outputs/heatmap.svg.upsetr.svg {output.upsetr}
|
|
93 """
|
|
94
|
|
95 rule micropan:
|
|
96 input:
|
|
97 binpav="outputs/heatmap.svg.pangenes_01matrix.txt"
|
|
98 output:
|
|
99 txt="outputs/rarefaction_curves.txt",
|
|
100 pdf="outputs/rarefaction_curves.pdf",
|
|
101 svg="outputs/rarefaction_curves.svg",
|
|
102 heaps="outputs/heaps.tsv"
|
|
103 shell:
|
|
104 """
|
|
105 Rscript $PANEX_PATH/R/micropan_rarefaction.R -f {input.binpav} -p {output.pdf} -a {output.heaps} -o {output.txt}
|
|
106 pdf2svg {output.pdf} {output.svg}
|
|
107 """
|
|
108
|