view msi_combine.xml @ 7:19d8eee15959 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/msi_combine commit 5feaf3d0e0da8cef1241fecc1f4d6f81324594e6
author galaxyp
date Wed, 22 Aug 2018 13:41:16 -0400
parents f4aafc565aa3
children c6564ddf0744
line wrap: on
line source

<tool id="mass_spectrometry_imaging_combine" name="MSI combine" version="1.10.0.5">
    <description>
        combine several mass spectrometry imaging datasets into one
    </description>
    <requirements>
        <!-- Both packages are loaded with library() at the top of the generated R script:
             Cardinal reads and combines the MSI data, ggplot2 draws the QC PDF. -->
        <requirement type="package" version="1.10.0">bioconductor-cardinal</requirement>
        <requirement type="package" version="3.0.0">r-ggplot2</requirement>
    </requirements>
    <command detect_errors="exit_code">
    <![CDATA[
        ## Link every MSI input into the working directory as infile_<index>.<extension>.
        ## The composite datatypes (imzml, analyze75) keep their parts in the extra files
        ## directory; every other input is linked as a single RData file.
        #for $i, $infile in enumerate($infiles):
            #if $infile.ext == 'imzml'
                ln -s '${infile.extra_files_path}/imzml' infile_${i}.imzML &&
                ln -s '${infile.extra_files_path}/ibd' infile_${i}.ibd &&
            #elif $infile.ext == 'analyze75'
                ln -s '${infile.extra_files_path}/hdr' infile_${i}.hdr &&
                ln -s '${infile.extra_files_path}/img' infile_${i}.img &&
                ln -s '${infile.extra_files_path}/t2m' infile_${i}.t2m &&
            #else
                ln -s '$infile' infile_${i}.RData &&
            #end if
        #end for
        ## Annotation tables are linked by position as well; the R script reads
        ## annotation_file_<index>.tabular for the MSI input with the same index.
        #for $i, $annotation_file in enumerate($annotation_files):
            ln -s '$annotation_file' annotation_file_${i}.tabular &&
        #end for

        ## Print the generated R script to stdout, then run it.
        cat '${msi_combine}' &&
        Rscript '${msi_combine}'

    ]]>
    </command>
    <configfiles>
        <configfile name="msi_combine"><![CDATA[

#import re

################ load libraries and some preparations #################

library(Cardinal)
library(ggplot2)

## the shift table is only read for the xy_shifts method; it has no header line
## and row i+1 is later used for the input file with index i

#if str( $combine_conditional.combine_method ) == 'xy_shifts':
    input_list <- read.delim(
        "$combine_conditional.coordinates_file",
        header = FALSE,
        stringsAsFactors = FALSE)
#end if

## load RData and store with new variable name

loadRData <- function(fileName){
    ## Loads an RData file and returns the object stored in it.
    ##   fileName: path to a file that was written with save()
    ##   returns:  the stored object; if the file holds several objects, the one
    ##             whose name sorts first (the order ls() would list them in)
    ## The file is loaded into its own environment, so a stored object can never
    ## overwrite a variable of this function (for example one called "fileName").
    loaded_env <- new.env()
    object_names <- load(fileName, envir = loaded_env)
    if (length(object_names) == 0){
        stop("RData file contains no objects: ", fileName)
    }
    get(sort(object_names)[1], envir = loaded_env)
}

## preparations for reading files one by one with for loop

## template-side bookkeeping: number of inputs and the names of the R objects
## (msidata_<index>, pixelcoords_<index>) that the loop below creates
#set $num_infiles = len($infiles)
#set $msidata = []
#set $pixelcoords = []

## R-side accumulators that are filled while the files are read
all_files <- $num_infiles
valid_dataset <- logical()
pixel_vector <- numeric()
max_y <- numeric()
x_shifts <- 0
y_shifts <- 0


############## reading files and changing pixel coordinates ###################

#for $i, $infile in enumerate($infiles):

    ## read MSI data; the reader is chosen from the Galaxy datatype of the input

    #if $infile.ext == 'analyze75'
        msidata_$i <- readAnalyze('infile_${i}')
    #elif $infile.ext != 'imzml'
        ## everything that is neither analyze75 nor imzml was linked as an RData file
        msidata_$i <- loadRData('infile_${i}.RData')
    #elif str($processed_cond.processed_file) == "processed":
        ## processed imzML: pass the mass accuracy and its unit chosen in the tool form
        msidata_$i <- readImzML('infile_${i}',
                                mass.accuracy = $processed_cond.accuracy,
                                units.accuracy = "$processed_cond.units")
    #else
        msidata_$i <- readImzML('infile_${i}')
    #end if


    ## read annotation data, up to 5 annotations can be used for now

    ## read annotation tabular, set first two columns as x and y, merge with
    ## coordinates dataframe and order according to pixelorder in msidata
    input_annotation = read.delim("annotation_file_${i}.tabular", header = TRUE,
        stringsAsFactors = FALSE)

    ## coordinates given as one text column ("x = 1, y = 1") are split into two numeric columns
    if (is.character(input_annotation[,1])){
        annotation_coordinates = matrix(unlist(strsplit(as.character(input_annotation[,1]), "\\,")), ncol=2, byrow=TRUE)
        ## the number starts at the 5th character of both parts ("x = 1" and " y = 1")
        annotation_coordinates2 = cbind(as.numeric(substring(annotation_coordinates[,1], 5, last = 1000000L)), as.numeric(substring(annotation_coordinates[,2], 5, last = 1000000L)))
        ## drop = FALSE keeps a single annotation column as a named data frame column;
        ## without it the column collapses to a vector, its header is lost and cbind
        ## builds a matrix in which text annotations turn the coordinates into text
        input_annotation = cbind(annotation_coordinates2, input_annotation[, -1, drop = FALSE])
    }

    colnames(input_annotation)[1:2] = c("x", "y")
    ## pixel_index remembers the pixel order of the MSI data, merge() does not keep it
    msidata_coordinates = cbind(coord(msidata_$i)[,1:2], 1:ncol(msidata_$i))
    colnames(msidata_coordinates)[3] = "pixel_index"
    ## only first 5 annotation columns are kept
    if (ncol(input_annotation) > 7){
        input_annotation = input_annotation[,1:7]}

    ## all.x=TRUE: pixels without annotation stay in the table with NA values
    annotation_df = merge(msidata_coordinates, input_annotation, by=c("x", "y"), all.x=TRUE)
    ## pad with NA columns to x, y, pixel_index + 5 annotation columns
    annotation_df_8 = cbind(annotation_df, data.frame(matrix(NA,ncol=8-ncol(annotation_df), nrow=ncol(msidata_$i))))
    annotation_df_8_sorted = annotation_df_8[order(annotation_df_8\$pixel_index),]## orders pixel according to msidata

    ## each annotation column is assigned to the pixel in the pData slot of the MSIdata
    msidata_$i\$column1 = annotation_df_8_sorted[,4]
    msidata_$i\$column2 = annotation_df_8_sorted[,5]
    msidata_$i\$column3 = annotation_df_8_sorted[,6]
    msidata_$i\$column4 = annotation_df_8_sorted[,7]
    msidata_$i\$column5 = annotation_df_8_sorted[,8]

    ## same name for MSI data files necessary to combine data in one single coordinate system
    sampleNames(msidata_$i) = "msidata"

    ################### preparation xy shifts ##########################

    #if str( $combine_conditional.combine_method ) == 'xy_shifts':

        ## shift coordinates according to input tabular file and store file names
        coord(msidata_$i)\$x = coord(msidata_$i)\$x + input_list[$i+1,$combine_conditional.column_x] ## shifts x coordinates according to tabular file
        coord(msidata_$i)\$y = coord(msidata_$i)\$y + input_list[$i+1,$combine_conditional.column_y] ## shifts y coordinates according to tabular file
        pixel_vector = append(pixel_vector, rep(paste($i+1, input_list[$i+1,$combine_conditional.column_names], sep="_"),times=ncol(msidata_$i))) ## stores file name for each pixel
        msidata_$i\$combined_sample = rep(paste($i+1, input_list[$i+1,$combine_conditional.column_names], sep="_"),times=ncol(msidata_$i))
        ## shifted coordinates plus file number, used later to find positions shared by several files
        pixelcoords_$i = cbind(coord(msidata_$i)[,1:2], rep($i+1,ncol(msidata_$i)))
        #silent $pixelcoords.append('pixelcoords_'+str($i))
        colnames(pixelcoords_$i)[3] = "file_number"

    ################### preparation automatic combination ##########################

    #elif str( $combine_conditional.combine_method ) == 'automatic_combine':
        names_vector = character()
        ## use name of inputfile from Galaxy; every character that is not a word
        ## character, whitespace, "-", ".", "/", "[" or "]" is replaced by "_" so that
        ## the name can be written into the double quoted R string below
        #set escaped_element_identifier = re.sub('[^\w\-\s\[\]/.]', '_', str($infile.element_identifier))
        if (sum(spectra(msidata_$i)[],na.rm=TRUE)>0) ## use only valid files
        {
            ## files that already carry sample names (e.g. from an earlier combine run) keep them
            if (is.null(levels(msidata_$i\$combined_sample)))
            {
            names_vector = append(names_vector, rep(paste($i+1, "$escaped_element_identifier", sep="_"),ncol(msidata_$i)))
            msidata_$i\$combined_sample = as.factor(names_vector)
            }
        }

        ## Number of input files define grid which is row-wise filled with files

        ## move the file so that its smallest coordinate is 1, then add the current grid offset
        coord(msidata_$i)\$x = coord(msidata_$i)\$x - (min(coord(msidata_$i)\$x-1)) + x_shifts
        coord(msidata_$i)\$y = coord(msidata_$i)\$y - (min(coord(msidata_$i)\$y-1)) + y_shifts
        ## the next file in this row starts 5 pixels to the right of this one
        x_shifts = max(coord(msidata_$i)\$x) + 5
        max_y = append(max_y, max(coord(msidata_$i)\$y))
        ## a row holds ceiling(sqrt(number of files)) files; a whole number means the row is full
        new_row = ($i+1)/ceiling(sqrt(all_files))
        if (new_row%%1==0)
        {x_shifts = 0 ### when row is filled: x values start again at zero
         y_shifts = max(max_y) + 5 ### when row is filled: y value increases to start a new row
        max_y = numeric()}

    #end if

    ## remember the name of the R object for the combine step (template-side list)
    ## and record whether this file is usable: it needs pixels, features and a
    ## non-zero intensity sum

    #silent $msidata.append('msidata_'+str($i))
    has_pixels <- ncol(msidata_$i) > 0
    has_features <- nrow(msidata_$i) > 0
    has_intensities <- sum(spectra(msidata_$i)[], na.rm=TRUE) > 0
    valid_dataset <- c(valid_dataset, has_pixels & has_features & has_intensities)

#end for

## names of the annotation columns (everything after x and y) for the QC plots;
## input_annotation still holds the table of the last input file at this point
annotation_colnames <- colnames(input_annotation)[-(1:2)]

###################### automatic combination ###################################
################################################################################

#if str( $combine_conditional.combine_method ) == 'automatic_combine':
    print("automatic_combine")

    ## combine only valid datasets

    valid_data =  list(#echo ','.join($msidata)#)[valid_dataset]
    ## fail with a clear message instead of calling combine() without any dataset
    if (length(valid_data) == 0){
        stop("None of the input files has pixels, features and intensities; nothing to combine")
    }
    msidata_combined = do.call(combine, valid_data)
    print("Valid datasets in order of input bottom to top:")
    print(valid_dataset)

    ## create dataframe with x,y,sample_name; the outputs section of this script
    ## draws Combined_qc.pdf from it, so no separate PDF is written here
    ## (a PDF drawn at this point would be overwritten by that section)
    position_df = cbind(coord(msidata_combined)[,1:2], msidata_combined\$combined_sample)
    colnames(position_df)[3] = "sample_name"

    ## save as (.RData)

    msidata = msidata_combined
    save(msidata, file="$msidata_combined")


################################## xy shifts ###################################
################################################################################

#elif str( $combine_conditional.combine_method ) == 'xy_shifts':
    print("xy_shifts")

    ## find duplicated coordinates: every pixel whose shifted position is shared
    ## with another pixel is flagged (first and later occurrences alike)
    all_coordinates = do.call(rbind, list(#echo ','.join($pixelcoords)#))
    duplicated_coordinates= duplicated(all_coordinates[,1:2])| duplicated(all_coordinates[,1:2], fromLast=TRUE)
    ## count the distinct shared positions; halving the number of flagged pixels
    ## is only correct when exactly two pixels share each position
    print(paste0("Number of removed duplicated coordinates: ", nrow(unique(all_coordinates[duplicated_coordinates, 1:2, drop = FALSE]))))
    unique_coordinates = all_coordinates[!duplicated_coordinates,]

    ## remove duplicated coordinates
    datasetlist = list()
    for (usable_dataset in list(#echo ','.join($msidata)#)){
        pixelsofinterest = pixels(usable_dataset)[names(pixels(usable_dataset)) %in% rownames(unique_coordinates)]
        filtered_dataset = usable_dataset[,pixelsofinterest]
        ## append at the end of the list; a position counter that also advances for
        ## skipped datasets would leave NULL entries that are passed on to combine()
        if (ncol(filtered_dataset) > 0 ){
            datasetlist[[length(datasetlist) + 1]] = filtered_dataset}
    }

    msidata_combined = do.call(combine, datasetlist)

    ## save as (.RData)

    msidata = msidata_combined
    save(msidata, file="$msidata_combined")

    ## create x,y,sample_name dataframe for QC pdf; only the first two coordinate
    ## columns are used, as in the automatic combination
    position_df = cbind(coord(msidata)[,1:2], msidata\$combined_sample)
    colnames(position_df)[3] = "sample_name"

#end if


################################## outputs ####################################
################################################################################

########### QC with pixels and their annotations ################################

pdf("Combined_qc.pdf", width=15, height=15)

## first page: every pixel of the combined data, filled by the sample it belongs to
qc_theme <- theme(text=element_text(family="ArialMT", face="bold", size=15),
                  legend.position="bottom", legend.direction="vertical")
combine_plot <- ggplot(position_df, aes(x=x, y=y, fill=sample_name)) +
    geom_tile() +
    coord_fixed() +
    ggtitle("Spatial orientation of combined data") +
    theme_bw() +
    qc_theme +
    guides(fill=guide_legend(ncol=5, byrow=TRUE))

## label every sample at the mean position of its pixels with its file number,
## i.e. the part of the sample name in front of the first underscore
coord_labels <- aggregate(cbind(x,y)~sample_name, data=position_df, mean)
coord_labels\$file_number <- gsub( "_.*$", "", coord_labels\$sample_name)
for (label_row in 1:nrow(coord_labels)){
    combine_plot <- combine_plot + annotate("text",
        x=coord_labels[label_row, "x"],
        y=coord_labels[label_row, "y"],
        label=toString(coord_labels[label_row, "file_number"]))
}
print(combine_plot)


    ## annotation plots

    ## Draws the QC page for one annotation column and returns its data frame.
    ##   annotation_values: annotation of every pixel of msidata (column1 ... column5)
    ##   annotation_title:  header of that column in the annotation tabular; used as
    ##                      plot title, legend title and as name of the third column
    ##                      of the returned data frame
    ##   returns:           data frame with x, y and the annotation column
    ## A page is only drawn when at least one pixel has an annotation that is not NA.
    plot_annotation_column = function(annotation_values, annotation_title){
        annotation_plot_df = cbind(coord(msidata)[,1:2], annotation_values)
        ## fixed internal name so that the plot can refer to the column
        colnames(annotation_plot_df)[3] = "annotation_value"

        if (sum(is.na(annotation_plot_df[3])) < nrow(annotation_plot_df)){
            annotation_plot = ggplot(annotation_plot_df, aes(x=x, y=y, fill=annotation_value))+
                   geom_tile() +
                   coord_fixed()+
                   ggtitle(paste0(annotation_title))+
                   theme_bw()+
                   theme(text=element_text(family="ArialMT", face="bold", size=15))+
                   theme(legend.position="bottom",legend.direction="vertical")+
                   guides(fill=guide_legend(ncol=5,byrow=TRUE, title=annotation_title))
            ## explicit print is required, plots are not drawn automatically inside a function
            print(annotation_plot)}

        ## rename columnname for output tabular file
        colnames(annotation_plot_df)[3] = annotation_title
        annotation_plot_df
    }

    ## one page and one data frame per annotation column; the data frames are
    ## merged into the annotation tabular output further down
    column1_df = plot_annotation_column(msidata\$column1, annotation_colnames[1])
    column2_df = plot_annotation_column(msidata\$column2, annotation_colnames[2])
    column3_df = plot_annotation_column(msidata\$column3, annotation_colnames[3])
    column4_df = plot_annotation_column(msidata\$column4, annotation_colnames[4])
    column5_df = plot_annotation_column(msidata\$column5, annotation_colnames[5])

    dev.off()

##################### annotation tabular output ################################

    ## both tabular outputs are only written when the combined data still has
    ## features and pixels
    msidata_has_content <- length(features(msidata)) > 0 && length(pixels(msidata)) > 0

    if (msidata_has_content){
        ## join the sample names and the five annotation tables on the pixel
        ## coordinates, keeping rows that are missing in one of the tables
        merge_by_coordinates <- function(left_df, right_df){
            merge(left_df, right_df, by=c("x", "y"), all=TRUE)
        }
        combined_annotations <- Reduce(merge_by_coordinates,
            list(position_df, column1_df, column2_df, column3_df, column4_df, column5_df))
        write.table(combined_annotations, file="$annotation_output", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
    }else{
        print("No annotation tabular output because file has no features or pixels left")
    }

    ####################### optional matrix output #################################

    #if $output_matrix:

        if (msidata_has_content){
            ## first column m/z values, then one intensity column per pixel;
            ## the first row carries the pixel names
            intensity_table <- cbind(mz(msidata), spectra(msidata)[])
            header_row <- c("mz | spectra", names(pixels(msidata)))
            write.table(rbind(header_row, intensity_table), file="$matrixasoutput", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t")
        }else{
            print("No intensity matrix output because file has no features or pixels left")
        }

    #end if

    ]]></configfile>
    </configfiles>
    <inputs>
        <!-- MSI datasets to combine; the command block links them by position as infile_0, infile_1, ... -->
        <param name="infiles" type="data" multiple="true" format="imzml,rdata,analyze75"
            label="MSI data as imzml, analyze7.5 or Cardinal MSImageSet saved as RData"
            help="load imzml and ibd file by uploading composite datatype imzml"/>
        <!-- One setting for all inputs: it is applied to every imzml input, the other datatypes ignore it -->
        <conditional name="processed_cond">
            <param name="processed_file" type="select" label="Is the input file a processed imzML file ">
                <option value="no_processed" selected="True">not a processed imzML</option>
                <option value="processed">processed imzML</option>
            </param>
            <when value="no_processed"/>
            <when value="processed">
                <param name="accuracy" type="float" value="50" label="Mass accuracy to which the m/z values will be binned" help="This should be set to the native accuracy of the mass spectrometer, if known"/>
                <param name="units" display="radio" type="select" label="Unit of the mass accuracy" help="either m/z or ppm">
                    <option value="mz" >mz</option>
                    <option value="ppm" selected="True" >ppm</option>
                </param>
            </when>
        </conditional>
        <!-- Annotation tables are matched to the MSI inputs by position (annotation_file_0 belongs to infile_0, ...) -->
        <param name="annotation_files" type="data" multiple="true" format="tabular"
            label="Pixel annotations as tabular files, same number and order of files as input files"
            help="Coordinates in column 1 in format x = 1, y = 1 or first column x values, second column y values. Up to 5 columns with pixel annotations"/>
        <!-- Selects which branch of the R script arranges the files in the common coordinate system -->
        <conditional name="combine_conditional">
            <param name="combine_method" type="select" label="Select the way you want to combine multiple files" help="More detailed help can be found in the help section at the bottom">
                <option value="automatic_combine" selected="True" >automatic combination</option>
                <option value="xy_shifts">shift xy coordinates with a tabular file</option>
            </param>
            <when value="automatic_combine"/>
            <when value="xy_shifts">
                <!-- Read without header; row n of this table is used for the n-th MSI input -->
                <param name="coordinates_file" type="data" format="tabular" label="datasetnames, x and y values to shift pixel coordinates before combining"
            help="Tabular file with three columns: 1 for the filename, 1 for the x-coordinate shift and 1 for the y-coordinate shift. Pixels with the same coordinates after shifting will be deleted."/>
                <param name="column_x" data_ref="coordinates_file" label="Column with values for shift in x direction" type="data_column"/>
                <param name="column_y" data_ref="coordinates_file" label="Column with values for shift in y direction" type="data_column"/>
                <param name="column_names" data_ref="coordinates_file" label="Column with dataset names" type="data_column"/>
            </when>
        </conditional>
    <!-- When enabled, the R script additionally writes the intensity matrix as a tabular file -->
    <param name="output_matrix" type="boolean" display="radio" label="Intensity matrix output"/>
    </inputs>
    <outputs>
        <!-- Combined dataset, written by save() in the R script under the variable name msidata -->
        <data format="rdata" name="msidata_combined" label="MSI_data_combined"/>
        <!-- QC plots; the R script writes Combined_qc.pdf into the working directory -->
        <data format="pdf" name="combining_qc" from_work_dir="Combined_qc.pdf" label = "Combined_QC"/>
        <!-- x, y, sample name and the annotation columns of every pixel -->
        <data format="tabular" name="annotation_output" label="Annotation_tabular"/>
        <!-- Only created when the intensity matrix output option is enabled -->
        <data format="tabular" name="matrixasoutput" label="Combined_matrix">
            <filter>output_matrix</filter>
        </data>
    </outputs>
    <tests>
        <!-- xy_shifts method: three RData inputs shifted with a coordinates table, intensity matrix requested -->
        <test expect_num_outputs="4">
            <param name="infiles" value="msidata_1.RData,msidata_2.RData,msidata_3.RData" ftype="rdata"/>
            <param name="annotation_files" value="annotations_file1.tabular,annotations_file2.tabular,annotations_file3.tabular" ftype="tabular"/>
            <param name="combine_method" value="xy_shifts"/>
            <param name="coordinates_file" ftype="tabular" value="xy_coordinates.tabular"/>
            <param name="column_x" value="1"/>
            <param name="column_y" value="2"/>
            <param name="column_names" value="3"/>
            <param name="output_matrix" value="True"/>
            <output name="matrixasoutput" file="123_combined_matrix.tabular"/>
            <output name="annotation_output" file="123_annotation_output.tabular"/>
            <output name="msidata_combined" file="123_combined.RData" compare="sim_size" />
            <output name="combining_qc" file="123_combined_QC.pdf" compare="sim_size" delta="2000"/>
        </test>
        <!-- automatic combination of two RData inputs, intensity matrix requested -->
        <test expect_num_outputs="4">
            <param name="infiles" value="msidata_1.RData,msidata_2.RData" ftype="rdata"/>
            <param name="annotation_files" value="annotations_file1.tabular,annotations_file2.tabular" ftype="tabular"/>
            <param name="combine_method" value="automatic_combine"/>
            <param name="output_matrix" value="True"/>
            <output name="matrixasoutput" file="12_combined_matrix.tabular"/>
            <output name="annotation_output" file="12_annotation_output.tabular"/>
            <output name="msidata_combined" file="12_combined.RData" compare="sim_size" />
            <output name="combining_qc" file="12_combined_QC.pdf" compare="sim_size" delta="2000"/>
        </test>
        <!-- automatic combination without matrix output (three outputs); the second input is
             presumably the combined result of the first test, judging by its file name -->
        <test expect_num_outputs="3">
            <param name="infiles" value="msidata_1.RData,123_combined.RData" ftype="rdata"/>
            <param name="annotation_files" value="annotations_file1.tabular,123_annotation.tabular" ftype="tabular"/>
            <param name="combine_method" value="automatic_combine"/>
            <param name="output_matrix" value="False"/>
            <output name="annotation_output" file="112_annotation_output.tabular"/>
            <output name="msidata_combined" file="112_auto_combined.RData" compare="sim_size" />
            <output name="combining_qc" file="112_auto_combined_QC.pdf" compare="sim_size" delta="2000"/>
        </test>
    </tests>
    <help>
<![CDATA[

Cardinal is an R package that implements statistical & computational tools for analyzing mass spectrometry imaging datasets. `More information on Cardinal <http://cardinalmsi.org/>`_

This tool uses the Cardinal combine function to combine several mass spectrometry imaging datasets into one.

Input data: 3 types of input data can be used:

- imzml file (upload imzml and ibd file via the "composite" function) `Introduction to the imzml format <https://ms-imaging.org/wp/imzml/>`_
- Analyze7.5 (upload hdr, img and t2m file via the "composite" function)
- Cardinal "MSImageSet" data (with variable name "msidata", saved as .RData)


Input: 

- MSI data files with same m/z values (to obtain same m/z values for different files: filtering tool same m/z range and preprocessing tool same binning width)
- Tabular files with pixel annotations need to have the x values in the first column, y values in the second column and then up to five annotations in the next columns. The order of the annotations in the columns must be the same for all files (x and y in column 1 and 2; annotation1 in column3, annotation2 in column4,...)
- The order and the number of MSI data files and annotation tabular files must be the same
- For xy shifts with tabular file: Tabular file with x and y coordinates shift and file name (see below)

Options: 

- "automatic combination": files are automatically arranged in a grid (duplicated pixels are allowed), subfiles are named according to the input file name
- "shift xy coordinates with a tabular file": each file can be moved in x and y direction according to the user's needs. Provide one tabular file whose rows are in the order in which the files are loaded in the history (bottom to top); for each file it defines the x and y coordinate shifts in separate columns and the file name in a third column. The xy shift option combines all datasets and removes all duplicated pixels (same x and y coordinates).


Output: 

- single Cardinal "MSImageSet" with the combined data (variable name "msidata", saved as .RData)
- pdf that shows the pixel positions and annotations of the combined files
- Tabular file with pixel annotations (x,y,column with input file names, up to five annotation columns)
- optional: intensity matrix as tabular file (intensities for m/z in rows and pixel in columns)


]]>
    </help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/btv146</citation>
    </citations>
</tool>