Mercurial > repos > dereeper > pangenome_explorer
view Snakemake_files/Snakefile_wget_panaroo_heatmap_upset_COG @ 10:d103c41b6931 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 16:35:22 +0000 |
parents | e42d30da7a74 |
children |
line wrap: on
line source
import glob import os import shutil with open("genbank_ids") as f: SAMPLES = f.read().splitlines() with open("genbank_files") as f: for line in f.readlines(): cmd = "grep 'ACCESSION' "+line returned_value = subprocess.getoutput(cmd) words = returned_value.split() SAMPLES.append(words[1]) rule final: input: "pav_matrix.csv", "GCskew.txt", "pav_matrix.tsv", "heatmap.svg", "cog_output.txt" rule wget: input: "genbank_ids" output: expand("data/{sample}.fasta", sample=SAMPLES), expand("data/{sample}.gb", sample=SAMPLES) shell: """ perl wget.pl {input} data """ rule gcskew: input: "data/{sample}.fasta" output: "data/{sample}.fasta.gcskew.txt" shell: """ python3 SkewIT/src/gcskew.py -i {input} -o {input}.gcskew.txt -k 1000 -w 1000 """ rule concat_gcskew: input: expand("data/{sample}.fasta.gcskew.txt", sample=SAMPLES) output: out2="GCskew.txt" shell: """ cat {input} >>{output.out2} """ rule genbank2gff3: input: "data/{sample}.gb" output: gff1="data/{sample}.gb.gff", gff2="data/{sample}.gb.rmdup.gff" shell: """ perl bp_genbank2gff3.pl -o data {input} perl remove_duplicates_in_gff.pl {output.gff1} {output.gff2} """ rule panaroo: input: expand("data/{sample}.gb.rmdup.gff", sample=SAMPLES) output: pav="pav_matrix.csv" shell: """ mkdir panaroo_outdir panaroo --clean-mode strict -o panaroo_outdir -i data/*gb.rmdup.gff cp -rf panaroo_outdir/gene_presence_absence_roary.csv {output.pav} """ rule convert_matrix: input: pav="pav_matrix.csv" output: "pav_matrix.tsv" shell: """ perl ConvertPanarooMatrix.pl data pav_matrix.csv pav_matrix.tsv data/strains.txt """ rule heatmap_upset: input: pav="pav_matrix.tsv" output: "heatmap.svg" shell: """ perl GenerateHeatmapFromPAV.pl pav_matrix.tsv heatmap.svg """ rule cog: input: pav="pav_matrix.tsv" output: "cog_output.txt" shell: """ perl GetCogOfCluster.pl pav_matrix.tsv data {output} """