# HG changeset patch # User nikhil-joshi # Date 1357774752 18000 # Node ID a49aff09553e08202c19ad131d4371dedd703b90 # Parent 669899d20e5929d6da61db70fb1142c39edc9170 Uploaded diff -r 669899d20e59 -r a49aff09553e deseq/README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/README.md Wed Jan 09 18:39:12 2013 -0500 @@ -0,0 +1,23 @@ +# sam2counts and DESeq in Galaxy + +## About + +This is a Galaxy package that wraps sam2counts and DESeq for RNA-Seq analysis using a transcriptome reference. sam2counts takes SAM or BAM files that are created from an alignment to a transcriptome and creates counts of aligned reads for each transcript. DESeq uses the DESeq package from Bioconductor in R and analyzes the count data from sam2counts. DESeq outputs a toptable of transcripts sorted by adjusted p-value and a page of diagnostic plots. + +## Requirements + +Python 2.6.5 +pysam 0.6 (package for Python) +R 2.15 +Bioconductor 2.10 (package for R) +DESeq 1.8.3 (package for R) +aroma.light 1.24.0 (package for R) +lattice 0.20-6 (package for R) + +## Installation + +stderr_wrapper.py and sam2counts_galaxy.py must be in the path or they can remain in the tools directory with the xml files. deseq.R must be copied to the "tool-data" directory under the main Galaxy install directory. + +## Use + +sam2counts needs a SAM file (produced by aligning to a transcriptome) with header information as the input. The count data produced from this SAM file gets fed into DESeq. diff -r 669899d20e59 -r a49aff09553e deseq/deseq.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/deseq.R Wed Jan 09 18:39:12 2013 -0500 @@ -0,0 +1,195 @@ +## Run DESeq (oversimplified) in Galaxy. +## Format: Rscript deseq3.R tab-delimited-counts.txt comma,delimited,col,names,except,first,gene,id,col comparison,here outputfile +## +## The incoming data must have the first column be the gene names, and +## the rest raw counts. The next argument is the list of groups +## (i.e. liver,liver,kidney,kidney), and the final argument is the +## comparison, i.e. kidney,liver. This produces a top-table called +## top-table.txt ordered by p-value. + +cargs <- commandArgs() +cargs <- cargs[(which(cargs == "--args")+1):length(cargs)] + +countstable <- cargs[1] +conds <- unlist(strsplit(cargs[2], ',')) +comparison <- unlist(strsplit(cargs[3], ',')) +outputfile <- cargs[4] +diag.html = cargs[5] +temp.files.path = cargs[6] +counts.name = cargs[7] + +# the comparison must only have two values and the conds must +# be a vector from those values, at least one of each. +if (length(unique(conds)) != 2) { + stop("You can only have two columns types: ", cargs[2]) +} + +if (length(comparison) != 2) { + stop("Comparison type must be a tuple: ", cargs[3]) +} + +if (!identical(sort(comparison), sort(unique(conds)))) { + stop("Column types must use the two names from Comparison type, and vice versa. Must have at least one of each in the Column types.\nColumn types: ", cargs[2], "\n", "Comparison type: ", cargs[3]) +} + +sink("/dev/null") +dir.create(temp.files.path, recursive=TRUE) +library(DESeq) + +d <- read.table(countstable, sep="\t", header=TRUE, row.names=1) + +if (length(d) != length(conds)) { + stop("Number of total sample columns in counts file must correspond to the columns types field. E.g. if column types is 'kidney,kidney,liver,liver' then number of sample columns in counts file must be 4 as well.") +} + +cds <- newCountDataSet(d, conds) +cds <- estimateSizeFactors(cds) + +cdsBlind <- estimateDispersions( cds, method="blind" ) + +if (length(conds) != 2) { + cds <- estimateDispersions( cds ) + norep = FALSE +} + +if (length(conds) == 2) { + cds <- estimateDispersions( cds, method="blind", sharingMode="fit-only" ) + norep = TRUE +} + +plotDispEsts <- function( cds ) +{ + plot( + rowMeans( counts( cds, normalized=TRUE ) ), + fitInfo(cds)$perGeneDispEsts, + pch = '.', log="xy" ) + xg <- 10^seq( -.5, 5, length.out=300 ) + lines( xg, fitInfo(cds)$dispFun( xg ), col="red" ) + } + +temp.disp.est.plot = file.path( temp.files.path, "DispersionEstimatePlot.png" ) +png( temp.disp.est.plot, width=500, height=500 ) +plotDispEsts( cds ) +dev.off() + +res <- nbinomTest(cds, comparison[1], comparison[2]) + +write.table(res[order(res$padj), ], file=outputfile, quote=FALSE, row.names=FALSE, sep="\t") + + +plotDE <- function( res ) + plot( + res$baseMean, + res$log2FoldChange, + log="x", pch=20, cex=.3, + col = ifelse( res$padj < .1, "red", "black" ) ) + +temp.de.plot = file.path( temp.files.path, "DiffExpPlot.png") +png( temp.de.plot, width=500, height=500 ) +plotDE( res ) +dev.off() + + +temp.pval.plot = file.path( temp.files.path, "PvalHist.png") +png( temp.pval.plot, width=500, height=500 ) +hist(res$pval, breaks=100, col="skyblue", border="slateblue", main="") +dev.off() + +vsd <- getVarianceStabilizedData( cdsBlind ) + +temp.heatmap.plot = file.path( temp.files.path, "heatmap.png" ) +png( temp.heatmap.plot, width=700, height=700 ) +select <- order(res$pval)[1:100] +colors <- colorRampPalette(c("white","darkblue"))(100) +heatmap( vsd[select,], col = colors, scale = "none", Rowv=NULL, main="", margins=c(20,20)) +dev.off() + + +mod_lfc <- (rowMeans( vsd[, conditions(cds)==comparison[2], drop=FALSE] ) - rowMeans( vsd[, conditions(cds)==comparison[1], drop=FALSE] )) +lfc <- res$log2FoldChange +finite <- is.finite(lfc) +table(as.character(lfc[!finite]), useNA="always") +largeNumber <- 10 +lfc <- ifelse(finite, lfc, sign(lfc) * largeNumber) + +temp.modlfc.plot = file.path( temp.files.path, "modlfc.png" ) +png( temp.modlfc.plot, width=500, height=500 ) +plot( lfc, mod_lfc, pch=20, cex=.3, col = ifelse( finite, "#80808040", "red" ) ) +abline( a=0, b=1, col="#40404040" ) +dev.off() + +temp.sampclust.plot = file.path( temp.files.path, "sampclust.png" ) +png( temp.sampclust.plot, width=700, height=700 ) +dists <- dist( t( vsd ) ) +heatmap( as.matrix( dists ), + symm=TRUE, scale="none", margins=c(20,20), + col = colorRampPalette(c("darkblue","white"))(100), + labRow = paste( pData(cdsBlind)$condition, pData(cdsBlind)$type ) ) +dev.off() + + +library(aroma.light) +library(lattice) + +mdsPlot <- function(x, conds=NULL, cex=1, ...) { + d <- dist(x) + + mds.fit <- cmdscale(d, eig=TRUE, k=2) + + mds.d <- data.frame(x1=mds.fit$points[, 1], + x2=mds.fit$points[, 2], + labels=rownames(x)) + if (!is.null(conds)) + mds.d$treatment <- as.factor(conds) + + if (!is.null(conds)) { + p <- xyplot(x2 ~ x1, group=treatment, data=mds.d, panel=function(x, y, ..., groups, subscripts) { + panel.text(x, y, mds.d$labels[subscripts], cex=cex, col=trellis.par.get()$superpose.line$col[groups]) + }, ...) + } else { + p <- xyplot(x2 ~ x1, data=mds.d, panel=function(x, y, ..., groups, subscripts) { + panel.text(x, y, mds.d$labels[subscripts], cex=cex) + }, ...) + } + return(p) +} + +if (!norep) { + dn = normalizeQuantileRank(as.matrix(d)) + p = mdsPlot(t(log10(dn+1)), conds=conds, xlab="dimension 1", ylab="dimension 2", main="") + + temp.mds.plot = file.path( temp.files.path, "mds.png" ) + png( temp.mds.plot, width=500, height=500 ) + print(p) + dev.off() +} + +file.conn = file(diag.html, open="w") +writeLines( c(""), file.conn) +writeLines( c("

Diagnostic Plots for ", counts.name, "

"), file.conn) +writeLines( c("

Dispersion Estimates

"), file.conn) +writeLines( c("

"), file.conn) +writeLines( c("

Differential Expression - Base Mean vs. Log2 Fold Change

"), file.conn) +writeLines( c("

"), file.conn) +writeLines( c("

P-value histogram

"), file.conn) +writeLines( c("

"), file.conn) +writeLines( c("

Top 100 Genes/Transcripts by P-value

"), file.conn) +writeLines( c("

"), file.conn) +writeLines( c("

Moderated LFC vs. LFC

"), file.conn) +writeLines( c("

"), file.conn) +writeLines( c("

VST Sample Clustering

"), file.conn) +writeLines( c("

"), file.conn) +writeLines( c("

MDS Plot

"), file.conn) + +if (!norep) { + writeLines( c("

"), file.conn) +} + +if (norep) { + writeLines( c("

MDS plot not produced for unreplicated data

"), file.conn) +} + +writeLines( c(""), file.conn) +close(file.conn) + +sink(NULL) diff -r 669899d20e59 -r a49aff09553e deseq/deseq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/deseq.xml Wed Jan 09 18:39:12 2013 -0500 @@ -0,0 +1,31 @@ + + Run Differential Expression analysis from SAM To Count data + + + stderr_wrapper.py Rscript ${GALAXY_DATA_INDEX_DIR}/deseq.R $counts $column_types $comparison $top_table $diagnostic_html "$diagnostic_html.files_path" "$counts.name" + + + + + + + + ^(\w+,)+\w+$ + + + + + ^\w+,\w+$ + + + + + + + + + + NOTE: This DEseq Galaxy tool can only be run on counts files that are created from SAM files that have been aligned to a transcriptome. + + + diff -r 669899d20e59 -r a49aff09553e deseq/deseq/deseq.R --- a/deseq/deseq/deseq.R Wed Jan 09 18:27:59 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,195 +0,0 @@ -## Run DESeq (oversimplified) in Galaxy. -## Format: Rscript deseq3.R tab-delimited-counts.txt comma,delimited,col,names,except,first,gene,id,col comparison,here outputfile -## -## The incoming data must have the first column be the gene names, and -## the rest raw counts. The next argument is the list of groups -## (i.e. liver,liver,kidney,kidney), and the final argument is the -## comparison, i.e. kidney,liver. This produces a top-table called -## top-table.txt ordered by p-value. - -cargs <- commandArgs() -cargs <- cargs[(which(cargs == "--args")+1):length(cargs)] - -countstable <- cargs[1] -conds <- unlist(strsplit(cargs[2], ',')) -comparison <- unlist(strsplit(cargs[3], ',')) -outputfile <- cargs[4] -diag.html = cargs[5] -temp.files.path = cargs[6] -counts.name = cargs[7] - -# the comparison must only have two values and the conds must -# be a vector from those values, at least one of each. -if (length(unique(conds)) != 2) { - stop("You can only have two columns types: ", cargs[2]) -} - -if (length(comparison) != 2) { - stop("Comparison type must be a tuple: ", cargs[3]) -} - -if (!identical(sort(comparison), sort(unique(conds)))) { - stop("Column types must use the two names from Comparison type, and vice versa. Must have at least one of each in the Column types.\nColumn types: ", cargs[2], "\n", "Comparison type: ", cargs[3]) -} - -sink("/dev/null") -dir.create(temp.files.path, recursive=TRUE) -library(DESeq) - -d <- read.table(countstable, sep="\t", header=TRUE, row.names=1) - -if (length(d) != length(conds)) { - stop("Number of total sample columns in counts file must correspond to the columns types field. E.g. if column types is 'kidney,kidney,liver,liver' then number of sample columns in counts file must be 4 as well.") -} - -cds <- newCountDataSet(d, conds) -cds <- estimateSizeFactors(cds) - -cdsBlind <- estimateDispersions( cds, method="blind" ) - -if (length(conds) != 2) { - cds <- estimateDispersions( cds ) - norep = FALSE -} - -if (length(conds) == 2) { - cds <- estimateDispersions( cds, method="blind", sharingMode="fit-only" ) - norep = TRUE -} - -plotDispEsts <- function( cds ) -{ - plot( - rowMeans( counts( cds, normalized=TRUE ) ), - fitInfo(cds)$perGeneDispEsts, - pch = '.', log="xy" ) - xg <- 10^seq( -.5, 5, length.out=300 ) - lines( xg, fitInfo(cds)$dispFun( xg ), col="red" ) - } - -temp.disp.est.plot = file.path( temp.files.path, "DispersionEstimatePlot.png" ) -png( temp.disp.est.plot, width=500, height=500 ) -plotDispEsts( cds ) -dev.off() - -res <- nbinomTest(cds, comparison[1], comparison[2]) - -write.table(res[order(res$padj), ], file=outputfile, quote=FALSE, row.names=FALSE, sep="\t") - - -plotDE <- function( res ) - plot( - res$baseMean, - res$log2FoldChange, - log="x", pch=20, cex=.3, - col = ifelse( res$padj < .1, "red", "black" ) ) - -temp.de.plot = file.path( temp.files.path, "DiffExpPlot.png") -png( temp.de.plot, width=500, height=500 ) -plotDE( res ) -dev.off() - - -temp.pval.plot = file.path( temp.files.path, "PvalHist.png") -png( temp.pval.plot, width=500, height=500 ) -hist(res$pval, breaks=100, col="skyblue", border="slateblue", main="") -dev.off() - -vsd <- getVarianceStabilizedData( cdsBlind ) - -temp.heatmap.plot = file.path( temp.files.path, "heatmap.png" ) -png( temp.heatmap.plot, width=700, height=700 ) -select <- order(res$pval)[1:100] -colors <- colorRampPalette(c("white","darkblue"))(100) -heatmap( vsd[select,], col = colors, scale = "none", Rowv=NULL, main="", margins=c(20,20)) -dev.off() - - -mod_lfc <- (rowMeans( vsd[, conditions(cds)==comparison[2], drop=FALSE] ) - rowMeans( vsd[, conditions(cds)==comparison[1], drop=FALSE] )) -lfc <- res$log2FoldChange -finite <- is.finite(lfc) -table(as.character(lfc[!finite]), useNA="always") -largeNumber <- 10 -lfc <- ifelse(finite, lfc, sign(lfc) * largeNumber) - -temp.modlfc.plot = file.path( temp.files.path, "modlfc.png" ) -png( temp.modlfc.plot, width=500, height=500 ) -plot( lfc, mod_lfc, pch=20, cex=.3, col = ifelse( finite, "#80808040", "red" ) ) -abline( a=0, b=1, col="#40404040" ) -dev.off() - -temp.sampclust.plot = file.path( temp.files.path, "sampclust.png" ) -png( temp.sampclust.plot, width=700, height=700 ) -dists <- dist( t( vsd ) ) -heatmap( as.matrix( dists ), - symm=TRUE, scale="none", margins=c(20,20), - col = colorRampPalette(c("darkblue","white"))(100), - labRow = paste( pData(cdsBlind)$condition, pData(cdsBlind)$type ) ) -dev.off() - - -library(aroma.light) -library(lattice) - -mdsPlot <- function(x, conds=NULL, cex=1, ...) { - d <- dist(x) - - mds.fit <- cmdscale(d, eig=TRUE, k=2) - - mds.d <- data.frame(x1=mds.fit$points[, 1], - x2=mds.fit$points[, 2], - labels=rownames(x)) - if (!is.null(conds)) - mds.d$treatment <- as.factor(conds) - - if (!is.null(conds)) { - p <- xyplot(x2 ~ x1, group=treatment, data=mds.d, panel=function(x, y, ..., groups, subscripts) { - panel.text(x, y, mds.d$labels[subscripts], cex=cex, col=trellis.par.get()$superpose.line$col[groups]) - }, ...) - } else { - p <- xyplot(x2 ~ x1, data=mds.d, panel=function(x, y, ..., groups, subscripts) { - panel.text(x, y, mds.d$labels[subscripts], cex=cex) - }, ...) - } - return(p) -} - -if (!norep) { - dn = normalizeQuantileRank(as.matrix(d)) - p = mdsPlot(t(log10(dn+1)), conds=conds, xlab="dimension 1", ylab="dimension 2", main="") - - temp.mds.plot = file.path( temp.files.path, "mds.png" ) - png( temp.mds.plot, width=500, height=500 ) - print(p) - dev.off() -} - -file.conn = file(diag.html, open="w") -writeLines( c(""), file.conn) -writeLines( c("

Diagnostic Plots for ", counts.name, "

"), file.conn) -writeLines( c("

Dispersion Estimates

"), file.conn) -writeLines( c("

"), file.conn) -writeLines( c("

Differential Expression - Base Mean vs. Log2 Fold Change

"), file.conn) -writeLines( c("

"), file.conn) -writeLines( c("

P-value histogram

"), file.conn) -writeLines( c("

"), file.conn) -writeLines( c("

Top 100 Genes/Transcripts by P-value

"), file.conn) -writeLines( c("

"), file.conn) -writeLines( c("

Moderated LFC vs. LFC

"), file.conn) -writeLines( c("

"), file.conn) -writeLines( c("

VST Sample Clustering

"), file.conn) -writeLines( c("

"), file.conn) -writeLines( c("

MDS Plot

"), file.conn) - -if (!norep) { - writeLines( c("

"), file.conn) -} - -if (norep) { - writeLines( c("

MDS plot not produced for unreplicated data

"), file.conn) -} - -writeLines( c(""), file.conn) -close(file.conn) - -sink(NULL) diff -r 669899d20e59 -r a49aff09553e deseq/deseq/deseq.xml --- a/deseq/deseq/deseq.xml Wed Jan 09 18:27:59 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ - - Run Differential Expression analysis from SAM To Count data - - - stderr_wrapper.py Rscript ${GALAXY_DATA_INDEX_DIR}/deseq.R $counts $column_types $comparison $top_table $diagnostic_html "$diagnostic_html.files_path" "$counts.name" - - - - - - - - ^(\w+,)+\w+$ - - - - - ^\w+,\w+$ - - - - - - - - - - NOTE: This DEseq Galaxy tool can only be run on counts files that are created from SAM files that have been aligned to a transcriptome. - - - diff -r 669899d20e59 -r a49aff09553e deseq/deseq/sam2counts.xml --- a/deseq/deseq/sam2counts.xml Wed Jan 09 18:27:59 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ - - Produce count data from SAM or BAM files - - - sam2counts_galaxy.py - - - - sam2counts_galaxy.py - - #if str($filetype.ftype_select) == "sam": - ${first_input_sam} - #for $input_file_sam in $filetype.input_files_sam: - ${input_file_sam.additional_input_sam} - #end for - #end if - - #if str($filetype.ftype_select) == "bam": - -b - - ${first_input_bam} - #for $input_file_bam in $filetype.input_files_bam: - ${input_file_bam.additional_input_bam} - #end for - #end if - - -o $counts - -l $colnames - - - - - - ^(\w+,)+\w+$ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r 669899d20e59 -r a49aff09553e deseq/deseq/sam2counts_galaxy.py --- a/deseq/deseq/sam2counts_galaxy.py Wed Jan 09 18:27:59 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,209 +0,0 @@ -#!/usr/bin/env python - -""" -count.py -- Take SAM files and output a table of counts with column -names that are the filenames, and rowname that are the reference -names. - -Author: Vince Buffalo -Email: vsbuffaloAAAAA@gmail.com (with poly-A tail removed) -""" - -VERSION = 0.91 - -import sys -import csv -from os import path -try: - import pysam -except ImportError: - sys.exit("pysam not installed; please install it\n") - -from optparse import OptionParser - -def SAM_file_to_counts(filename, bam=False, extra=False, use_all_references=True): - """ - Take SAM filename, and create a hash of mapped and unmapped reads; - keys are reference sequences, values are the counts of occurences. - - Also, a hash of qualities (either 0 or >0) of mapped reads - is output, which is handy for diagnostics. - """ - counts = dict() - unique = dict() - nonunique = dict() - mode = 'r' - if bam: - mode = 'rb' - sf = pysam.Samfile(filename, mode) - - if use_all_references: - # Make dictionary of all entries in header - for sn in sf.header['SQ']: - if extra: - unique[sn['SN']] = 0 - nonunique[sn['SN']] = 0 - counts[sn['SN']] = 0 - - for read in sf: - if not read.is_unmapped: - id_name = sf.getrname(read.rname) if read.rname != -1 else 0 - - if not use_all_references and not counts.get(id_name, False): - ## Only make keys based on aligning reads, make empty hash - if extra: - unique[id_name] = 0 - nonunique[id_name] = 0 - ## initiate entry; even if not mapped, record 0 count - counts[id_name] = counts.get(id_name, 0) - - - counts[id_name] = counts.get(id_name, 0) + 1 - - if extra: - if read.mapq == 0: - nonunique[id_name] = nonunique[id_name] + 1 - else: - unique[id_name] = unique[id_name] + 1 - - if extra: - return {'counts':counts, 'unique':unique, 'nonunique':nonunique} - - return {'counts':counts} - -def collapsed_nested_count_dict(counts_dict, all_ids, order=None): - """ - Takes a nested dictionary `counts_dict` and `all_ids`, which is - built with the `table_dict`. All files (first keys) in - `counts_dict` are made into columns with order specified by - `order`. - - Output is a dictionary with keys that are the id's (genes or - transcripts), with values that are ordered counts. A header will - be created on the first row from the ordered columns (extracted - from filenames). - """ - if order is None: - col_order = counts_dict.keys() - else: - col_order = order - - collapsed_dict = dict() - for i, filename in enumerate(col_order): - for id_name in all_ids: - if not collapsed_dict.get(id_name, False): - collapsed_dict[id_name] = list() - - # get counts and append - c = counts_dict[filename].get(id_name, 0) - collapsed_dict[id_name].append(c) - return {'table':collapsed_dict, 'header':col_order} - - -def counts_to_file(table_dict, outfilename, delimiter=',', labels=''): - """ - A function for its side-effect of writing `table_dict` (which - contains a table and header), to `outfilename` with the specified - `delimiter`. - """ - writer = csv.writer(open(outfilename, 'w'), delimiter=delimiter) - table = table_dict['table'] - if labels: - header = labels.split(',') - else: - header = table_dict['header'] - - header_row = True - for id_name, fields in table.items(): - if header_row: - row = ['id'] + header - writer.writerow(row) - header_row = False - - if id_name == 0: - continue - row = [id_name] - row.extend(fields) - writer.writerow(row) - -if __name__ == '__main__': - parser = OptionParser() - parser.add_option("-d", "--delimiter", dest="delimiter", - help="the delimiter (default: tab)", default='\t') - parser.add_option("-o", "--out-file", dest="out_file", - help="output filename (default: counts.txt)", - default='counts.txt', action="store", type="string") - parser.add_option("-u", "--extra-output", dest="extra_out", - help="output extra information on non-unique and unique mappers (default: False)", - default=False, action="store_true") - parser.add_option("-b", "--bam", dest="bam", - help="all input files are BAM (default: False)", - default=False, action="store_true") - parser.add_option("-r", "--use-all-references", dest="use_all_references", - help="Use all the references from the SAM header (default: True)", - default=True, action="store_false") - parser.add_option("-f", "--extra-out-files", dest="extra_out_files", - help="comma-delimited filenames of unique and non-unique output " - "(default: unique.txt,nonunique.txt)", - default='unique.txt,nonunique.txt', action="store", type="string") - parser.add_option("-v", "--verbose", dest="verbose", - help="enable verbose output") - parser.add_option("-l", "--columns-labels", dest="col_labels", help="comma-delimited label names for samples", action="store", type="string") - - (options, args) = parser.parse_args() - - if len(args) < 1: - parser.error("one or more SAM files as arguments required") - - file_counts = dict() - file_unique_counts = dict() - file_nonunique_counts = dict() - all_ids = list() - files = [path.basename(f) for f in args] - - if options.col_labels and len(files) != len(options.col_labels.split(',')): - parser.error("Number of sample names does not equal number of files") - - if len(set(files)) != len(set(args)): - parser.error("file args must have unique base names (i.e. no foo/bar joo/bar)") - - ## do a pre-run check that all files exist - for full_filename in args: - if not path.exists(full_filename): - parser.error("file '%s' does not exist" % full_filename) - - for full_filename in args: - filename = path.basename(full_filename) - ## read in SAM file, extract counts, and unpack counts - tmp = SAM_file_to_counts(full_filename, bam=options.bam, extra=options.extra_out, - use_all_references=options.use_all_references) - - if options.extra_out: - counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique'] - else: - counts = tmp['counts'] - - ## save counts, and unique/non-unique counts - file_counts[filename] = counts - - if options.extra_out: - file_unique_counts[filename] = unique - file_nonunique_counts[filename] = nonunique - - ## add all ids encountered in this in this file - all_ids.extend(file_counts[filename].keys()) - - ## Uniquify all_ids, and then take the nested file_counts - ## dictionary, collapse, and write to file. - all_ids = set(all_ids) - table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=files) - counts_to_file(table_dict, options.out_file, delimiter=options.delimiter, labels=options.col_labels) - - if options.extra_out: - unique_fn, nonunique_fn = options.extra_out_files.split(',') - unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids, order=files) - nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids, order=files) - - counts_to_file(unique_table_dict, unique_fn, delimiter=options.delimiter) - counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=options.delimiter) - diff -r 669899d20e59 -r a49aff09553e deseq/deseq/stderr_wrapper.py --- a/deseq/deseq/stderr_wrapper.py Wed Jan 09 18:27:59 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -#!/usr/bin/python - -""" -Wrapper that executes a program with its arguments but reports standard error -messages only if the program exit status was not 0. This is useful to prevent -Galaxy to interpret that there was an error if something was printed on stderr, -e.g. if this was simply a warning. -Example: ./stderr_wrapper.py myprog arg1 -f arg2 -Author: Florent Angly -""" - -import sys, subprocess - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - # Get command-line arguments - args = sys.argv - # Remove name of calling program, i.e. ./stderr_wrapper.py - args.pop(0) - # If there are no arguments left, we're done - if len(args) == 0: - return - - # If one needs to silence stdout - args.append( ">" ) - args.append( "/dev/null" ) - - #cmdline = " ".join(args) - #print cmdline - try: - # Run program - proc = subprocess.Popen( args=args, shell=False, stderr=subprocess.PIPE ) - returncode = proc.wait() - # Capture stderr, allowing for case where it's very large - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += proc.stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - # Running Grinder failed: write error message to stderr - if returncode != 0: - raise Exception, stderr - except Exception, e: - # Running Grinder failed: write error message to stderr - stop_err( 'Error: ' + str( e ) ) - - -if __name__ == "__main__": __main__() diff -r 669899d20e59 -r a49aff09553e deseq/sam2counts.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/sam2counts.xml Wed Jan 09 18:39:12 2013 -0500 @@ -0,0 +1,68 @@ + + Produce count data from SAM or BAM files + + + sam2counts_galaxy.py + + + + sam2counts_galaxy.py + + #if str($filetype.ftype_select) == "sam": + ${first_input_sam} + #for $input_file_sam in $filetype.input_files_sam: + ${input_file_sam.additional_input_sam} + #end for + #end if + + #if str($filetype.ftype_select) == "bam": + -b + + ${first_input_bam} + #for $input_file_bam in $filetype.input_files_bam: + ${input_file_bam.additional_input_bam} + #end for + #end if + + -o $counts + -l $colnames + + + + + + ^(\w+,)+\w+$ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 669899d20e59 -r a49aff09553e deseq/sam2counts_galaxy.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/sam2counts_galaxy.py Wed Jan 09 18:39:12 2013 -0500 @@ -0,0 +1,209 @@ +#!/usr/bin/env python + +""" +count.py -- Take SAM files and output a table of counts with column +names that are the filenames, and rowname that are the reference +names. + +Author: Vince Buffalo +Email: vsbuffaloAAAAA@gmail.com (with poly-A tail removed) +""" + +VERSION = 0.91 + +import sys +import csv +from os import path +try: + import pysam +except ImportError: + sys.exit("pysam not installed; please install it\n") + +from optparse import OptionParser + +def SAM_file_to_counts(filename, bam=False, extra=False, use_all_references=True): + """ + Take SAM filename, and create a hash of mapped and unmapped reads; + keys are reference sequences, values are the counts of occurences. + + Also, a hash of qualities (either 0 or >0) of mapped reads + is output, which is handy for diagnostics. + """ + counts = dict() + unique = dict() + nonunique = dict() + mode = 'r' + if bam: + mode = 'rb' + sf = pysam.Samfile(filename, mode) + + if use_all_references: + # Make dictionary of all entries in header + for sn in sf.header['SQ']: + if extra: + unique[sn['SN']] = 0 + nonunique[sn['SN']] = 0 + counts[sn['SN']] = 0 + + for read in sf: + if not read.is_unmapped: + id_name = sf.getrname(read.rname) if read.rname != -1 else 0 + + if not use_all_references and not counts.get(id_name, False): + ## Only make keys based on aligning reads, make empty hash + if extra: + unique[id_name] = 0 + nonunique[id_name] = 0 + ## initiate entry; even if not mapped, record 0 count + counts[id_name] = counts.get(id_name, 0) + + + counts[id_name] = counts.get(id_name, 0) + 1 + + if extra: + if read.mapq == 0: + nonunique[id_name] = nonunique[id_name] + 1 + else: + unique[id_name] = unique[id_name] + 1 + + if extra: + return {'counts':counts, 'unique':unique, 'nonunique':nonunique} + + return {'counts':counts} + +def collapsed_nested_count_dict(counts_dict, all_ids, order=None): + """ + Takes a nested dictionary `counts_dict` and `all_ids`, which is + built with the `table_dict`. All files (first keys) in + `counts_dict` are made into columns with order specified by + `order`. + + Output is a dictionary with keys that are the id's (genes or + transcripts), with values that are ordered counts. A header will + be created on the first row from the ordered columns (extracted + from filenames). + """ + if order is None: + col_order = counts_dict.keys() + else: + col_order = order + + collapsed_dict = dict() + for i, filename in enumerate(col_order): + for id_name in all_ids: + if not collapsed_dict.get(id_name, False): + collapsed_dict[id_name] = list() + + # get counts and append + c = counts_dict[filename].get(id_name, 0) + collapsed_dict[id_name].append(c) + return {'table':collapsed_dict, 'header':col_order} + + +def counts_to_file(table_dict, outfilename, delimiter=',', labels=''): + """ + A function for its side-effect of writing `table_dict` (which + contains a table and header), to `outfilename` with the specified + `delimiter`. + """ + writer = csv.writer(open(outfilename, 'w'), delimiter=delimiter) + table = table_dict['table'] + if labels: + header = labels.split(',') + else: + header = table_dict['header'] + + header_row = True + for id_name, fields in table.items(): + if header_row: + row = ['id'] + header + writer.writerow(row) + header_row = False + + if id_name == 0: + continue + row = [id_name] + row.extend(fields) + writer.writerow(row) + +if __name__ == '__main__': + parser = OptionParser() + parser.add_option("-d", "--delimiter", dest="delimiter", + help="the delimiter (default: tab)", default='\t') + parser.add_option("-o", "--out-file", dest="out_file", + help="output filename (default: counts.txt)", + default='counts.txt', action="store", type="string") + parser.add_option("-u", "--extra-output", dest="extra_out", + help="output extra information on non-unique and unique mappers (default: False)", + default=False, action="store_true") + parser.add_option("-b", "--bam", dest="bam", + help="all input files are BAM (default: False)", + default=False, action="store_true") + parser.add_option("-r", "--use-all-references", dest="use_all_references", + help="Use all the references from the SAM header (default: True)", + default=True, action="store_false") + parser.add_option("-f", "--extra-out-files", dest="extra_out_files", + help="comma-delimited filenames of unique and non-unique output " + "(default: unique.txt,nonunique.txt)", + default='unique.txt,nonunique.txt', action="store", type="string") + parser.add_option("-v", "--verbose", dest="verbose", + help="enable verbose output") + parser.add_option("-l", "--columns-labels", dest="col_labels", help="comma-delimited label names for samples", action="store", type="string") + + (options, args) = parser.parse_args() + + if len(args) < 1: + parser.error("one or more SAM files as arguments required") + + file_counts = dict() + file_unique_counts = dict() + file_nonunique_counts = dict() + all_ids = list() + files = [path.basename(f) for f in args] + + if options.col_labels and len(files) != len(options.col_labels.split(',')): + parser.error("Number of sample names does not equal number of files") + + if len(set(files)) != len(set(args)): + parser.error("file args must have unique base names (i.e. no foo/bar joo/bar)") + + ## do a pre-run check that all files exist + for full_filename in args: + if not path.exists(full_filename): + parser.error("file '%s' does not exist" % full_filename) + + for full_filename in args: + filename = path.basename(full_filename) + ## read in SAM file, extract counts, and unpack counts + tmp = SAM_file_to_counts(full_filename, bam=options.bam, extra=options.extra_out, + use_all_references=options.use_all_references) + + if options.extra_out: + counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique'] + else: + counts = tmp['counts'] + + ## save counts, and unique/non-unique counts + file_counts[filename] = counts + + if options.extra_out: + file_unique_counts[filename] = unique + file_nonunique_counts[filename] = nonunique + + ## add all ids encountered in this in this file + all_ids.extend(file_counts[filename].keys()) + + ## Uniquify all_ids, and then take the nested file_counts + ## dictionary, collapse, and write to file. + all_ids = set(all_ids) + table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=files) + counts_to_file(table_dict, options.out_file, delimiter=options.delimiter, labels=options.col_labels) + + if options.extra_out: + unique_fn, nonunique_fn = options.extra_out_files.split(',') + unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids, order=files) + nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids, order=files) + + counts_to_file(unique_table_dict, unique_fn, delimiter=options.delimiter) + counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=options.delimiter) + diff -r 669899d20e59 -r a49aff09553e deseq/stderr_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq/stderr_wrapper.py Wed Jan 09 18:39:12 2013 -0500 @@ -0,0 +1,57 @@ +#!/usr/bin/python + +""" +Wrapper that executes a program with its arguments but reports standard error +messages only if the program exit status was not 0. This is useful to prevent +Galaxy to interpret that there was an error if something was printed on stderr, +e.g. if this was simply a warning. +Example: ./stderr_wrapper.py myprog arg1 -f arg2 +Author: Florent Angly +""" + +import sys, subprocess + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def __main__(): + # Get command-line arguments + args = sys.argv + # Remove name of calling program, i.e. ./stderr_wrapper.py + args.pop(0) + # If there are no arguments left, we're done + if len(args) == 0: + return + + # If one needs to silence stdout + args.append( ">" ) + args.append( "/dev/null" ) + + #cmdline = " ".join(args) + #print cmdline + try: + # Run program + proc = subprocess.Popen( args=args, shell=False, stderr=subprocess.PIPE ) + returncode = proc.wait() + # Capture stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + # Running Grinder failed: write error message to stderr + if returncode != 0: + raise Exception, stderr + except Exception, e: + # Running Grinder failed: write error message to stderr + stop_err( 'Error: ' + str( e ) ) + + +if __name__ == "__main__": __main__()