view msi_filtering.xml @ 2:22db5eb94e50 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/msi_filtering commit 1c808d60243bb1eeda0cd26cb4b0a17ab05de2c0
author galaxyp
date Mon, 28 May 2018 12:36:24 -0400
parents 98c101b19f3c
children d51c3c814d57
line wrap: on
line source

<tool id="mass_spectrometry_imaging_filtering" name="MSI filtering" version="1.10.0.0">
    <description>tool for filtering mass spectrometry imaging data</description>
    <!-- Runtime dependencies of the embedded R script: Cardinal is attached to read and subset
         the MSI data, gridExtra supplies grid.table, which draws the QC summary table. -->
    <requirements>
        <requirement type="package" version="1.10.0">bioconductor-cardinal</requirement>
        <requirement type="package" version="2.2.1">r-gridextra</requirement>
    </requirements>
    <command detect_errors="exit_code">
    <![CDATA[

        ## Link the input files under fixed names into the working directory,
        ## because the R script reads them as 'infile' / 'infile.RData'.
        #if $infile.ext == 'imzml'
            ln -s '${infile.extra_files_path}/imzml' infile.imzML &&
            ln -s '${infile.extra_files_path}/ibd' infile.ibd &&
        #elif $infile.ext == 'analyze75'
            ln -s '${infile.extra_files_path}/hdr' infile.hdr &&
            ln -s '${infile.extra_files_path}/img' infile.img &&
            ln -s '${infile.extra_files_path}/t2m' infile.t2m &&
        #else
            ## quoted like all other paths, so that unusual characters in the path cannot break the command
            ln -s '$infile' infile.RData &&
        #end if
        ## Write the generated R script and its location to the job log, then execute it.
        cat '${MSI_subsetting}' &&
        echo '${MSI_subsetting}' &&
        Rscript '${MSI_subsetting}'

    ]]>
    </command>
    <configfiles>
        <configfile name="MSI_subsetting"><![CDATA[


############################ attach packages and import the MSI data ###########

## Cardinal is attached first and gridExtra second
library(Cardinal)
library(gridExtra)

## The command section links the input as 'infile.*' into the working directory.
## The RData branch restores whatever objects the file contains; the rest of the
## script works on an object called msidata.
#if $infile.ext == 'imzml'
    msidata <- readImzML("infile")
#elif $infile.ext == 'analyze75'
    msidata <- readAnalyze("infile")
#else
    load("infile.RData")
#end if

##################################### QC: inputfile properties in numbers ######

## Summary numbers of the unfiltered data; they fill the "before" column of the QC table.
## Only computed when the QC report is requested.

#if $outputs.outputs_select == "quality_control":
    ## Number of features (mz)
    maxfeatures = length(features(msidata))
    ## Range mz
    minmz = round(min(mz(msidata)), digits=2)
    maxmz = round(max(mz(msidata)), digits=2)
    ## Number of spectra (pixels)
    pixelcount = length(pixels(msidata))
    ## Range x coordinates
    minimumx = min(coord(msidata)[,1])
    maximumx = max(coord(msidata)[,1])
    ## Range y coordinates
    minimumy = min(coord(msidata)[,2])
    maximumy = max(coord(msidata)[,2])
    ## Number of intensities > 0
    ## counted per pixel with colSums (doubles) so that the total cannot overflow the integer range
    npeaks = sum(colSums(spectra(msidata)[]>0))
    ## Spectra multiplied with mz (potential number of peaks)
    ## multiplied as doubles, the integer product becomes NA for large datasets
    numpeaks = as.numeric(ncol(spectra(msidata)[]))*nrow(spectra(msidata)[])
    ## Percentage of intensities > 0
    percpeaks = round(npeaks/numpeaks*100, digits=2)
    ## Number of empty TICs
    TICs = colSums(spectra(msidata)[])
    NumemptyTIC = sum(TICs == 0)
    ## median TIC
    medint = round(median(TICs), digits=2)
    ## Store features for QC plot
    featuresinfile = mz(msidata)
#end if


###################################### Filtering of pixels #####################

## Every mode sets numberpixels (entries given by the user) and validpixels
## (entries found in the data) for the QC table and subsets msidata by pixel.
## If a filter matches no pixel at all, an empty dataset is returned.

### Pixels in the one column format "x=,y="

#if str($pixels_cond.pixel_filtering) == "single_column":
    print("single column")

    input_list = read.delim("$pixels_cond.single_pixels", header = FALSE, stringsAsFactors = FALSE)
    inputpixels = input_list[,$pixels_cond.pixel_column]
    numberpixels = length(inputpixels)
    ## entries are matched against the pixel names of the dataset
    validpixels = sum(inputpixels %in% names(pixels(msidata)))

    if (validpixels != 0)
    {
        pixelsofinterest = pixels(msidata)[names(pixels(msidata)) %in% inputpixels]
        msidata = msidata[,pixelsofinterest]
    }else{
        msidata = msidata[,0]
    }


### Pixels in two columns format: x and y in different columns

#elif str($pixels_cond.pixel_filtering) == "two_columns":
    print("two columns")

    input_list = read.delim("$pixels_cond.two_columns_pixel", header = FALSE,
                            stringsAsFactors = FALSE)
    numberpixels = length(input_list[,$pixels_cond.pixel_column_x])

    inputpixel_x = input_list[,$pixels_cond.pixel_column_x]
    inputpixel_y = input_list[,$pixels_cond.pixel_column_y]

    inputpixels = cbind(inputpixel_x, inputpixel_y)
    colnames(inputpixels) = c("x", "y")
    ## merging on x and y keeps only coordinate pairs that exist in the dataset
    valid_rows = merge(inputpixels, coord(msidata)[,1:2])
    validpixels = nrow(valid_rows)

    if (validpixels != 0)
    {
        ## build the pixel names for all valid coordinate pairs at once
        pixelvector = paste0("x = ", valid_rows[,1],", ", "y = ", valid_rows[,2])
        pixelsofinterest = pixels(msidata)[names(pixels(msidata)) %in% pixelvector]
        msidata = msidata[,pixelsofinterest]
    }else{
        ## no listed pixel exists in the data: return an empty dataset like the other filter modes
        msidata = msidata[,0]
    }


### Pixels within x and y minima and maxima are kept:

#elif str($pixels_cond.pixel_filtering) == "pixel_range":
    print("pixel range")

    numberpixels = "range"
    validpixels = "range"

    ## a pixel is kept only if both of its coordinates lie inside the requested ranges
    x_in_range = coord(msidata)\$x <= $pixels_cond.max_x_range & coord(msidata)\$x >= $pixels_cond.min_x_range
    y_in_range = coord(msidata)\$y <= $pixels_cond.max_y_range & coord(msidata)\$y >= $pixels_cond.min_y_range
    pixels_in_range = x_in_range & y_in_range

    if (sum(pixels_in_range) > 0)
    {
        msidata = msidata[, pixels_in_range]
    }else{
        msidata = msidata[,0]
        print("no valid pixel found")
    }



#elif str($pixels_cond.pixel_filtering) == "none":
    print("no pixel filtering")
    numberpixels = 0
    validpixels = 0

#end if



###################################### filtering of features ######################

## Every mode sets numberfeatures (entries given by the user) and validmz
## (entries found in the data) for the QC table and subsets msidata by feature.

### Tabular file contains mz either as numbers or in the format mz=800.01

#if str($features_cond.features_filtering) == "features_list":
    print("feature list")

    input_features = read.delim("$features_cond.inputfeatures", header = FALSE, stringsAsFactors = FALSE)

    ## rows above startingrow are header lines; the logical index also copes with
    ## a header count that is larger than the number of rows in the file
    startingrow = $features_cond.feature_header+1
    feature_rows = seq_len(nrow(input_features)) >= startingrow
    extracted_features = input_features[feature_rows,$features_cond.feature_column]
    numberfeatures = length(extracted_features)

    if (!grepl("m/z = ", input_features[startingrow,$features_cond.feature_column]))

### if input is in numeric format
    {
        ## The file is read without header, so a skipped header line turns the whole
        ## column into text; converting explicitly accepts such columns and integer
        ## columns as well. Entries that are no numbers become NA and never match.
        numeric_features = suppressWarnings(as.numeric(extracted_features))

        if (any(!is.na(numeric_features)))
        {
            mz_added = paste0("m/z = ", round(numeric_features,digits=2))
            validfeatures = mz_added %in% names(features(msidata))
            featuresofinterest = features(msidata)[names(features(msidata)) %in% mz_added[validfeatures]]
            validmz = sum(validfeatures)
        }else{
            validmz = 0
            featuresofinterest = 0
        }

### if input is already in character format (m/z = 800.01)

    }else{
        validfeatures = extracted_features %in% names(features(msidata))
        featuresofinterest = features(msidata)[names(features(msidata)) %in% extracted_features[validfeatures]]
        validmz = sum(validfeatures)
    }

### filter msidata for valid features
    msidata = msidata[featuresofinterest,]


### Only features within a given minimum and maximum value are kept:

#elif str($features_cond.features_filtering) == "features_range":
    print("feature range")

    numberfeatures = "range"
    validmz = "range"

    features_in_range = mz(msidata) >= $features_cond.min_mz & mz(msidata) <= $features_cond.max_mz

    if (sum(features_in_range) > 0)
    {
        msidata = msidata[features_in_range,]
    }else{
        msidata = msidata[0,]
        print("no valid mz range")
    }


#elif str($features_cond.features_filtering) == "none":

    print("no feature filtering")
    validmz = 0
    numberfeatures = 0
#end if



## Write the msidata object, after any pixel and feature filtering above,
## to the RData output dataset of the tool.
# save msidata as Rfile
save(msidata, file="$msidata_filtered")

###################################### outputfile properties in numbers ########

#if $outputs.outputs_select == "quality_control":

## Summary numbers of the filtered data; they fill the "filtered" column of the QC table.
## Number of features (mz)
maxfeatures2 = length(features(msidata))
## Range mz
minmz2 = round(min(mz(msidata)), digits=2)
maxmz2 = round(max(mz(msidata)), digits=2)
## Number of spectra (pixels)
pixelcount2 = length(pixels(msidata))
## Range x coordinates
minimumx2 = min(coord(msidata)[,1])
maximumx2 = max(coord(msidata)[,1])
## Range y coordinates
minimumy2 = min(coord(msidata)[,2])
maximumy2 = max(coord(msidata)[,2])
## Number of intensities > 0
## counted per pixel with colSums (doubles) so that the total cannot overflow the integer range
npeaks2 = sum(colSums(spectra(msidata)[]>0))
## Spectra multiplied with mz (potential number of peaks)
## multiplied as doubles, the integer product becomes NA for large datasets
numpeaks2 = as.numeric(ncol(spectra(msidata)[]))*nrow(spectra(msidata)[])
## Percentage of intensities > 0
percpeaks2 = round(npeaks2/numpeaks2*100, digits=2)
## Number of empty TICs
TICs2 = colSums(spectra(msidata)[])
NumemptyTIC2 = sum(TICs2 == 0)
## median TIC
medint2 = round(median(TICs2), digits=2)


## QC table with one row per property: the label, the value of the input file
## and the value after filtering. The last two rows report how many pixels and
## mz values were requested and how many of them were found in the data.
property_df <- data.frame(
    properties = c(
        "Number of mz features",
        "Range of mz values [Da]",
        "Number of pixels",
        "Range of x coordinates",
        "Range of y coordinates",
        "Intensities > 0",
        "Median TIC per pixel",
        "Number of zero TICs",
        "pixel overview",
        "feature overview"
    ),
    before = c(
        paste0(maxfeatures),
        paste(minmz, maxmz, sep = " - "),
        paste0(pixelcount),
        paste(minimumx, maximumx, sep = " - "),
        paste(minimumy, maximumy, sep = " - "),
        paste0(percpeaks, " %"),
        paste0(medint),
        paste0(NumemptyTIC),
        paste0("input pixels: ", numberpixels),
        paste0("input mz: ", numberfeatures)
    ),
    filtered = c(
        paste0(maxfeatures2),
        paste(minmz2, maxmz2, sep = " - "),
        paste0(pixelcount2),
        paste(minimumx2, maximumx2, sep = " - "),
        paste(minimumy2, maximumy2, sep = " - "),
        paste0(percpeaks2, " %"),
        paste0(medint2),
        paste0(NumemptyTIC2),
        paste0("valid pixels: ", validpixels),
        paste0("valid mz: ", validmz)
    )
)



######################################## PDF QC ################################

    ## First page: an empty plot carries the title, the property table is drawn on top of it
    pdf("filtertool_QC.pdf", fonts = "Times", pointsize = 12)
    plot(0, type = "n", axes = FALSE, ann = FALSE)
    title(main = paste0("Qualitycontrol of filtering tool for file: \n\n", "$infile.display_name"))
    grid.table(property_df, rows = NULL)

    ## The plots are only drawn if the filtered data still has features and pixels
    data_left <- length(features(msidata)) > 0 & length(pixels(msidata)) > 0

    if (data_left)
    {
        ## heatmap image as visual pixel control; the y axis limits are the
        ## filtered y range widened by 20 percent and given in reversed order
        heatmap_title <- paste0($outputs.inputmz," ± ", $outputs.plusminus_dalton, " Da")
        heatmap_ylim <- c(maximumy2+0.2*maximumy2, minimumy2-0.2*minimumy2)
        image(msidata, mz=$outputs.inputmz, plusminus = $outputs.plusminus_dalton,
              contrast.enhance = "none", main = heatmap_title, ylim = heatmap_ylim)

        ## control features which are left: mz values of the input file in red,
        ## mz values after filtering in green slightly below them
        mz_left <- mz(msidata)
        plot(featuresinfile, rep(1, length(featuresinfile)), yaxt = "n", ylab = NA,
             xlab = "m/z values", col = "red", ylim = c(0.8, 1.1),
             main = "Distribution of m/z values")
        lines(mz_left, rep(0.9, length(mz_left)), col = "green", type = "p")
        legend("top", horiz = TRUE,
               legend = c("before", "filtered"),
               fill = c("red", "green"))
    }else{
        print("file has no features or pixels left")
    }

    dev.off()

#end if

######################################## intensity matrix ######################

## Optional tabular output: intensities with mz values as row names and
## pixel names as column names.

#if $output_matrix:

## the table is only written if the filtered data still has features and pixels
if (length(features(msidata)) > 0 && length(pixels(msidata)) > 0)
{

    spectramatrix = spectra(msidata)
    rownames(spectramatrix) = mz(msidata)
    ## the pixel row is bound on top only to carry the pixel names over as column names
    newmatrix = rbind(pixels(msidata), spectramatrix)
    ## the helper row is removed again; drop = FALSE keeps the matrix shape when
    ## only one feature is left, otherwise the table would be written transposed
    write.table(newmatrix[2:nrow(newmatrix), , drop = FALSE], file="$matrixasoutput", quote = FALSE, row.names = TRUE, col.names=NA, sep = "\t")

}else{
    print("file has no features or pixels left")
}

#end if


    ]]></configfile>
    </configfiles>
    <inputs>
        <!-- MSI dataset to filter; the command section links it into the working directory according to its datatype -->
        <param name="infile" type="data" format="imzml, rdata, analyze75"
               label="Inputfile as imzML, Analyze7.5 or Cardinal MSImageSet saved as RData"
                help="Upload composite datatype imzML (ibd+imzML) or analyze75 (hdr+img+t2m) or regular upload .RData (Cardinal MSImageSet)"/>
        <!-- Pixel filter: none, a list of pixel names in one column, x and y values in two columns, or ranges for x and y -->
        <conditional name="pixels_cond">
            <param name="pixel_filtering" type="select" label="Select pixel filtering option">
                <option value="none" selected="True">none</option>
                <option value="single_column">tabular file with single column (x = 1, y = 1)</option>
                <option value="two_columns">tabular file with separate columns for x and y values</option>
                <option value="pixel_range">ranges for x and y</option>
            </param>
            <when value="none"/>
            <when value="single_column">
                <param name="single_pixels" type="data" format="tabular" label="Pixels in single column for filtering of MSI data"
                    help="tabular file with pixels of interest in the form x = 1, y = 1"/>
                <param name="pixel_column" data_ref="single_pixels" label="Column with pixels" type="data_column"/>
            </when> 
            <when value="two_columns">
                <param name="two_columns_pixel" type="data" format="tabular" label="Pixels in two columns for filtering of MSI data"
                    help="tabular file with pixels of interest in two separate columns"/>
                <param name="pixel_column_x" data_ref="two_columns_pixel" label="Column with x values" type="data_column"/>
                <param name="pixel_column_y" data_ref="two_columns_pixel" label="Column with y values" type="data_column"/>
            </when> 
            <when value="pixel_range">
                <param name="min_x_range" type="integer" value="0" label="Minimum value for x"/>
                <param name="max_x_range" type="integer" value="100" label="Maximum value for x"/>
                <param name="min_y_range" type="integer" value="0" label="Minimum value for y"/>
                <param name="max_y_range" type="integer" value="100" label="Maximum value for y"/>
            </when> 
        </conditional>
        <!-- Feature (m/z) filter: none, a list of m/z values from a tabular file, or a minimum/maximum m/z range -->
        <conditional name="features_cond">
            <param name="features_filtering" type="select" label="Select feature filtering option">
                <option value="none" selected="True">none</option>
                <option value="features_list">tabular file with features (data type: 800.12 or m/z = 800.12)</option>
                <option value="features_range">range of features</option>
            </param>
            <when value="none"/>
            <when value="features_list">
                <param name="inputfeatures" type="data" format="tabular" label="Features for filtering of MSI data" help="tabular file with masses of interest either as numbers (800.05) or in the form m/z = 800.05"/>
                <param name="feature_column" data_ref="inputfeatures" label="Column with features" type="data_column"/>
                <param name="feature_header" label="Number of header lines to skip" value="0" type="integer"/>
            </when> 
            <when value="features_range">
                <param name="min_mz" type="float" value="1" label="Minimum value for mz (in Dalton)"/>
                <param name="max_mz" type="float" value="100" label="Maximum value for mz (in Dalton)"/>
            </when> 
        </conditional>
        <!-- QC report switch; inputmz and plusminus_dalton define the m/z window of the heatmap drawn in the PDF -->
        <conditional name="outputs">
           <param name="outputs_select" type="select" label="Quality control output">
               <option value="quality_control" selected="True">yes</option>
               <option value="no_quality_control">no</option>
           </param>
           <when value="quality_control">
              <param name="inputmz" type="float" value="1296.7" label="Mass for which a heatmap image will be drawn" help="Use a mass which is still present in all pixels to control if the pixel filtering went well"/>
              <param name="plusminus_dalton" value="0.25" type="float" label="mass range for mz value" help="plusminus mass window in Dalton"/>
           </when>
           <when value="no_quality_control"/>
         </conditional>
         <!-- When set, the filtered intensities are additionally written as a tabular matrix -->
         <param name="output_matrix" type="boolean" display="radio" label="Intensity matrix output"/>
    </inputs>
    <outputs>
        <!-- Always produced: the filtered msidata object, written by save() in the R script -->
        <data format="rdata" name="msidata_filtered" label="${tool.name} ${on_string}"/>
        <!-- Only with QC enabled: the PDF the R script writes as filtertool_QC.pdf in the working directory -->
        <data format="pdf" name="filtering_qc" from_work_dir="filtertool_QC.pdf" label = "QC ${tool.name} ${on_string}">
            <filter>outputs["outputs_select"] == "quality_control"</filter>
        </data>
        <!-- Only when the intensity matrix output is selected -->
        <data format="tabular" name="matrixasoutput" label="Matrix ${tool.name} ${on_string}">
            <filter>output_matrix</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test 1: imzML input, pixel list in a single column, feature list from column 2 with one header line, QC report -->
        <test expect_num_outputs="2">
            <param name="infile" value="" ftype="imzml">
                <composite_data value="Example_Continuous.imzML"/>
                <composite_data value="Example_Continuous.ibd"/>
            </param>
            <param name="pixel_filtering" value="single_column"/>
            <param name="single_pixels" ftype="tabular" value = "inputpixels.tabular"/>
            <param name="pixel_column" value="1"/>
            <param name="features_filtering" value="features_list"/>
            <param name="inputfeatures" ftype="tabular" value = "inputfeatures.tabular"/>
            <param name="feature_column" value="2"/>
            <param name="feature_header" value="1"/>
             <param name="outputs_select" value="quality_control"/>
                <param name="inputmz" value="328.9"/>
                <param name="plusminus_dalton" value="0.25"/>
            <output name="filtering_qc" file="imzml_filtered.pdf" compare="sim_size" delta="20000"/>
            <output name="msidata_filtered" file="imzml_filtered.RData" compare="sim_size" />
        </test>
        <!-- Test 2: imzML input, pixel range x 10 to 20 and y 2 to 2, no feature filtering, QC report -->
        <test expect_num_outputs="2">
            <param name="infile" value="" ftype="imzml">
                <composite_data value="Example_Continuous.imzML"/>
                <composite_data value="Example_Continuous.ibd"/>
            </param>
            <param name="pixel_filtering" value="pixel_range"/>
            <param name="min_x_range" value="10"/>
            <param name="max_x_range" value="20"/>
            <param name="min_y_range" value="2"/>
            <param name="max_y_range" value="2"/>
             <param name="outputs_select" value="quality_control"/>
                <param name="inputmz" value="328.9"/>
                <param name="plusminus_dalton" value="0.25"/>
            <output name="filtering_qc" file="imzml_filtered2.pdf" compare="sim_size" delta="20000"/>
            <output name="msidata_filtered" file="imzml_filtered2.RData" compare="sim_size" />
        </test>
        <!-- Test 3: imzML input, pixel range x 1 to 20 and y 2 to 2, m/z range 0 to 500, QC report and intensity matrix -->
        <test expect_num_outputs="3">
            <param name="infile" value="" ftype="imzml">
                <composite_data value="Example_Continuous.imzML"/>
                <composite_data value="Example_Continuous.ibd"/>
            </param>
            <param name="pixel_filtering" value="pixel_range"/>
            <param name="min_x_range" value="1"/>
            <param name="max_x_range" value="20"/>
            <param name="min_y_range" value="2"/>
            <param name="max_y_range" value="2"/>
            <param name="features_filtering" value="features_range"/>
            <param name="min_mz" value="0" />
            <param name="max_mz" value="500"/>
             <param name="outputs_select" value="quality_control"/>
                <param name="inputmz" value="328.9"/>
                <param name="plusminus_dalton" value="0.25"/>
            <param name="output_matrix" value="True"/>
            <output name="filtering_qc" file="imzml_filtered3.pdf" compare="sim_size" delta="20000"/>
            <output name="msidata_filtered" file="imzml_filtered3.RData" compare="sim_size" />
            <output name="matrixasoutput" file="imzml_matrix3.tabular"/>
        </test>
        <!-- Test 4: imzML input, pixels from two columns (x in column 1, y in column 3), feature list from column 1 without header, QC report -->
        <test expect_num_outputs="2">
            <param name="infile" value="" ftype="imzml">
                <composite_data value="Example_Continuous.imzML"/>
                <composite_data value="Example_Continuous.ibd"/>
            </param>
            <param name="pixel_filtering" value="two_columns"/>
            <param name="two_columns_pixel" ftype="tabular" value = "inputpixels_2column.tabular"/>
            <param name="pixel_column_x" value="1"/>
            <param name="pixel_column_y" value="3"/>
            <param name="features_filtering" value="features_list"/>
            <param name="inputfeatures" ftype="tabular" value = "inputcalibrantfile2.txt"/>
            <param name="feature_column" value="1"/>
            <param name="feature_header" value="0"/>
             <param name="outputs_select" value="quality_control"/>
                <param name="inputmz" value="328.9"/>
                <param name="plusminus_dalton" value="0.25"/>
            <output name="filtering_qc" file="imzml_filtered4.pdf" compare="sim_size" delta="20000"/>
            <output name="msidata_filtered" file="imzml_filtered4.RData" compare="sim_size" />
        </test>
        <!-- Test 5: imzML input, pixel range x 0 to 10 and y 2 to 20, m/z range 500 to 700, QC report -->
        <test expect_num_outputs="2">
            <param name="infile" value="" ftype="imzml">
                <composite_data value="Example_Continuous.imzML"/>
                <composite_data value="Example_Continuous.ibd"/>
            </param>
            <param name="pixel_filtering" value="pixel_range"/>
            <param name="min_x_range" value="0"/>
            <param name="max_x_range" value="10"/>
            <param name="min_y_range" value="2"/>
            <param name="max_y_range" value="20"/>
            <param name="features_filtering" value="features_range"/>
            <param name="min_mz" value="500" />
            <param name="max_mz" value="700"/>
             <param name="outputs_select" value="quality_control"/>
                <param name="inputmz" value="328.9"/>
                <param name="plusminus_dalton" value="0.25"/>
            <output name="filtering_qc" file="imzml_filtered5.pdf" compare="sim_size" delta="20000"/>
            <output name="msidata_filtered" file="imzml_filtered5.RData" compare="sim_size" />
        </test>
        <!-- Test 6: Analyze7.5 input, pixel list in a single column, feature list from column 1, QC report and intensity matrix -->
        <test expect_num_outputs="3">
           <param name="infile" value="" ftype="analyze75">
                <composite_data value="Analyze75.hdr"/>
                <composite_data value="Analyze75.img"/>
                <composite_data value="Analyze75.t2m"/>
            </param>
            <param name="pixel_filtering" value="single_column"/>
            <param name="single_pixels" ftype="tabular" value = "inputpixels2.tabular"/>
            <param name="pixel_column" value="1"/>
            <param name="features_filtering" value="features_list"/>
            <param name="inputfeatures" ftype="tabular" value = "featuresofinterest2.tabular"/>
            <param name="feature_column" value="1"/>
            <conditional name="outputs">
                <param name="outputs_select" value="quality_control"/>
                    <param name="inputmz" value="1200"/>
                    <param name="plusminus_dalton" value="0.25"/>
            </conditional>
            <param name="output_matrix" value="True"/>
            <output name="filtering_qc" file="analyze_filtered.pdf" compare="sim_size" delta="20000"/>
            <output name="msidata_filtered" file="analyze_filtered.RData" compare="sim_size" />
            <output name="matrixasoutput" file="analyze_matrix.tabular"/>
        </test>
        <!-- Test 7: Analyze7.5 input without any pixel or feature filter set, QC report -->
        <test expect_num_outputs="2">
           <param name="infile" value="" ftype="analyze75">
                <composite_data value="Analyze75.hdr"/>
                <composite_data value="Analyze75.img"/>
                <composite_data value="Analyze75.t2m"/>
            </param>
            <conditional name="outputs">
                <param name="outputs_select" value="quality_control"/>
                    <param name="inputmz" value="1200"/>
                    <param name="plusminus_dalton" value="0.25"/>
            </conditional>
            <output name="filtering_qc" file="analyze75_filtered2.pdf" compare="sim_size" delta="20000"/>
            <output name="msidata_filtered" file="analyze_filteredoutside.RData" compare="sim_size" />
        </test>
        <!-- Test 8: RData input without any filter set and without QC report, intensity matrix requested -->
        <test expect_num_outputs="2">
            <param name="infile" value="preprocessed.RData" ftype="rdata"/>
            <conditional name="outputs">
                <param name="outputs_select" value="no_quality_control"/>
            </conditional>
            <param name="output_matrix" value="True"/>
            <output name="matrixasoutput" file="rdata_matrix.tabular"/>
            <output name="msidata_filtered" file="rdata_notfiltered.RData" compare="sim_size" />
        </test>
    </tests>
    <help>
        <![CDATA[

Cardinal is an R package that implements statistical & computational tools for analyzing mass spectrometry imaging datasets. `More information on Cardinal <http://cardinalmsi.org//>`_

This tool provides options to filter (subset) pixels and masses of mass-spectrometry imaging data.

Input data: 3 types of input data can be used:

- imzml file (upload imzml and ibd file via the "composite" function) `Introduction to the imzml format <https://ms-imaging.org/wp/imzml/>`_
- Analyze7.5 (upload hdr, img and t2m file via the "composite" function)
- Cardinal "MSImageSet" data (with variable name "msidata", saved as .RData)

Options:

- pixel filtering: can use a tabular file containing x and y coordinates or by defining a range for x and y by hand
- mass filtering: can use a tabular file containing masses of interest or by defining a range for the mass values

Output: 

- RData file with the Cardinal "MSImageSet" (variable name "msidata") filtered for pixels and/or masses
- optional: pdf with heatmap showing the pixels that are left after filtering and plot of masses before and after filtering
- optional: intensity matrix as tabular file (intensities for masses in rows and pixel in columns)

Tip: 

- It is recommended to use the filtering tool only for masses which have been extracted from the same dataset, because masses are only kept if they match a feature of the dataset exactly. If you have masses from dataset A and you want to use them to filter dataset B, first find the corresponding (closest) features in dataset B by using the tool "Join two files on column allowing a small difference". Afterwards use these corresponding feature masses of dataset B to filter dataset B.


        ]]>
    </help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/btv146</citation>
    </citations>
</tool>