view Snakemake_files/Snakefile_wget_panaroo_heatmap_upset_COG @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
line wrap: on
line source

import glob
import os
import shutil

with open("genbank_ids") as f:
 SAMPLES = f.read().splitlines()

with open("genbank_files") as f:
    for line in f.readlines():
        cmd = "grep 'ACCESSION' "+line
        returned_value = subprocess.getoutput(cmd)
        words = returned_value.split()
        SAMPLES.append(words[1])

rule final:
 input:
  "pav_matrix.csv",
  "GCskew.txt",
  "pav_matrix.tsv",
  "heatmap.svg",
  "cog_output.txt"

rule wget:
    input:
        "genbank_ids"
    output:
        expand("data/{sample}.fasta", sample=SAMPLES),
        expand("data/{sample}.gb", sample=SAMPLES)
    shell:
        """
        perl wget.pl {input} data
        """

rule gcskew:
    input:
        "data/{sample}.fasta" 
    output:
        "data/{sample}.fasta.gcskew.txt"
    shell:
        """
        python3 SkewIT/src/gcskew.py -i {input} -o {input}.gcskew.txt -k 1000 -w 1000
        """ 

rule concat_gcskew:
    input:
        expand("data/{sample}.fasta.gcskew.txt", sample=SAMPLES)
    output:
        out2="GCskew.txt"
    shell:
        """
         cat {input} >>{output.out2}
        """

rule genbank2gff3:
    input:
        "data/{sample}.gb"
    output:
        gff1="data/{sample}.gb.gff",
        gff2="data/{sample}.gb.rmdup.gff"
    shell:
        """
        perl bp_genbank2gff3.pl -o data {input}
        perl remove_duplicates_in_gff.pl {output.gff1} {output.gff2}
        """

rule panaroo:
    input:
        expand("data/{sample}.gb.rmdup.gff", sample=SAMPLES)
    output:
        pav="pav_matrix.csv"
    shell:
        """
        mkdir panaroo_outdir
        panaroo --clean-mode strict -o panaroo_outdir -i data/*gb.rmdup.gff
        cp -rf panaroo_outdir/gene_presence_absence_roary.csv {output.pav}
        """

rule convert_matrix:
    input:
        pav="pav_matrix.csv"
    output:
        "pav_matrix.tsv"
    shell:
        """
        perl ConvertPanarooMatrix.pl data pav_matrix.csv pav_matrix.tsv data/strains.txt        
        """

rule heatmap_upset:
    input:
        pav="pav_matrix.tsv"
    output:
        "heatmap.svg"
    shell:
        """
        perl GenerateHeatmapFromPAV.pl pav_matrix.tsv heatmap.svg
        """

rule cog:
    input:
        pav="pav_matrix.tsv"
    output:
        "cog_output.txt"
    shell:
        """
        perl GetCogOfCluster.pl pav_matrix.tsv data {output}
        """