3
|
1 import glob
|
|
2 import os
|
|
3 import shutil
|
|
4
|
|
5 import yaml
|
|
6 configfile: "config.yaml"
|
|
7
|
|
8
|
|
9 SAMPLES = []
|
|
10 with open("config.yaml", "r") as yaml_file:
|
|
11 genome_data = yaml.safe_load(yaml_file)
|
|
12 for genome_name, genome_info in genome_data["input_genomes"].items():
|
|
13 name = genome_info["name"]
|
|
14 SAMPLES.append(name)
|
|
15
|
|
16
|
|
17 rule all:
|
|
18 input:
|
|
19 "outputs/seqfile",
|
|
20 "outputs/pggb_out/all_genomes.fa.smooth.final.gfa",
|
|
21 "outputs/pav_matrix.tsv",
|
|
22 "outputs/heatmap.svg.gz",
|
|
23 "outputs/rarefaction_curves.txt"
|
|
24
|
|
25 rule get_data:
|
|
26 input:
|
|
27 "config.yaml"
|
|
28 output:
|
|
29 "outputs/seqfile",
|
|
30 expand("outputs/genomes/{sample}.fasta", sample=SAMPLES),
|
|
31 expand("outputs/genomes/{sample}.gff", sample=SAMPLES),
|
|
32 run:
|
|
33 with open(input[0], "r") as yaml_file, open(output[0], "w") as output_file:
|
|
34 genome_data = yaml.safe_load(yaml_file)
|
|
35 for genome_name, genome_info in genome_data["input_genomes"].items():
|
|
36 fasta_file = genome_info["fasta"]
|
|
37 name = genome_info["name"]
|
|
38 output_file.write(f"{genome_name}\t{genome_name}\t{fasta_file}\n")
|
|
39 cmd = "sed 's/>/>"+str(name)+"#/g' "+fasta_file+" >outputs/genomes/"+str(name)+".fasta"
|
|
40 os.system(cmd)
|
|
41 gff_file = genome_info["gff3"]
|
|
42 shutil.copy(gff_file, "outputs/genomes/"+str(name)+".gff")
|
|
43
|
|
44 rule pggb:
|
|
45 input:
|
|
46 expand("outputs/genomes/{sample}.fasta", sample=SAMPLES),
|
|
47 expand("outputs/genomes/{sample}.gff", sample=SAMPLES),
|
|
48 output:
|
|
49 gfa="outputs/pggb_out/all_genomes.fa.smooth.final.gfa",
|
|
50 png1="outputs/pggb_out/all_genomes.fa.lay.draw.png",
|
|
51 png2="outputs/pggb_out/all_genomes.fa.og.viz_multiqc.png",
|
|
52 shell:
|
|
53 """
|
|
54 cat outputs/genomes/*fasta >outputs/genomes/all_genomes.fa
|
|
55 samtools faidx outputs/genomes/all_genomes.fa
|
14
|
56 reference=$(head -1 outputs/genomes/all_genomes.fa | awk '{{print $2}}')
|
3
|
57 pggb -i outputs/genomes/all_genomes.fa -o outputs/pggb_out -V $reference -m
|
|
58 mv outputs/pggb_out/all_genomes.*smooth.final.gfa {output.gfa}
|
|
59 mv outputs/pggb_out/all_genomes.*lay.draw.png {output.png1}
|
|
60 mv outputs/pggb_out/all_genomes.fa.*.og.viz_multiqc.png {output.png2}
|
|
61 """
|
|
62
|
|
63 rule create_gene_paths:
|
|
64 input:
|
|
65 gff="outputs/genomes/{sample}.gff",
|
|
66 gfa="outputs/pggb_out/all_genomes.fa.smooth.final.gfa",
|
|
67 output:
|
|
68 basename="outputs/genomes/{sample}.gene_segments",
|
|
69 gene_length="outputs/genomes/{sample}.gene_segments.gene_length.txt",
|
|
70 bed="outputs/genomes/{sample}.gene_segments.bed",
|
|
71 shell:
|
|
72 """
|
|
73 perl $PANEX_PATH/Perl/CreateGenePathsFromGFA.pl {input.gfa} {input.gff} {output.basename}
|
|
74 """
|
|
75
|
|
76 rule bedtools_intersect:
|
|
77 input:
|
|
78 samples="outputs/seqfile",
|
|
79 bedfiles=expand("outputs/genomes/{sample}.gene_segments.bed", sample=SAMPLES)
|
|
80 params:
|
|
81 identity=30
|
|
82 output:
|
|
83 "outputs/pav_matrix.tsv",
|
|
84 shell:
|
|
85 """
|
|
86 perl $PANEX_PATH/Perl/GeneratePAVfromBed.pl {input.samples} outputs/genomes {output} {params.identity}
|
|
87 """
|
|
88
|
|
89 rule heatmap_upset:
|
|
90 input:
|
|
91 pav="outputs/pav_matrix.tsv"
|
|
92 output:
|
|
93 heatmap="outputs/heatmap.svg.gz",
|
|
94 html="outputs/heatmap.svg.heatmap_plotly.html",
|
|
95 upsetr="outputs/upsetr.svg",
|
|
96 binpav="outputs/heatmap.svg.pangenes_01matrix.txt"
|
|
97 shell:
|
|
98 """
|
|
99 perl $PANEX_PATH/Perl/GenerateHeatmapFromPAV.pl {input.pav} outputs/heatmap.svg
|
|
100 mv outputs/heatmap.svg.upsetr.svg {output.upsetr}
|
|
101 """
|
|
102
|
|
103 rule micropan:
|
|
104 input:
|
|
105 binpav="outputs/heatmap.svg.pangenes_01matrix.txt"
|
|
106 output:
|
|
107 txt="outputs/rarefaction_curves.txt",
|
|
108 pdf="outputs/rarefaction_curves.pdf",
|
|
109 svg="outputs/rarefaction_curves.svg",
|
|
110 heaps="outputs/heaps.tsv"
|
|
111 shell:
|
|
112 """
|
|
113 Rscript $PANEX_PATH/R/micropan_rarefaction.R -f {input.binpav} -p {output.pdf} -a {output.heaps} -o {output.txt}
|
|
114 pdf2svg {output.pdf} {output.svg}
|
|
115 """
|