view data_exporter.xml @ 5:90d2e00e1304 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cardinal commit 1df4591de435d232862f20669aea529ceb23b12a"
author galaxyp
date Fri, 13 Dec 2019 13:56:29 -0500
parents e521b5767819
children 350a84ea795c
line wrap: on
line source

<tool id="cardinal_data_exporter" name="MSI data exporter" version="@VERSION@.2">
    <description>
        exports imzML and Analyze7.5 to tabular files
    </description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!-- Prints the generated R script (configfile cardinal_imzml_exporter) to stdout and, if that succeeds, runs it with Rscript.
         @INPUT_LINKING@ is a token that is presumably defined in macros.xml (not visible in this file). -->
    <command detect_errors="exit_code">
    <![CDATA[

        @INPUT_LINKING@
        cat '${cardinal_imzml_exporter}' &&
        Rscript '${cardinal_imzml_exporter}'

    ]]>
    </command>
    <configfiles>
        <configfile name="cardinal_imzml_exporter"><![CDATA[

################################# load libraries and read file #################

library(Cardinal)

@READING_MSIDATA_INRAM@


    ###################### Intensity matrix output ################################

    #if "int_matrix" in str($output_options).split(","):
        print("intensity matrix output")

        ## Row labels: feature names with " = " replaced by "_" and every "/" removed
        feature_labels = gsub("/", "", gsub(" = ", "_", names(features(msidata))))
        ## Column labels: pixel names with ", y = " replaced by "_" and the remaining " = " replaced by "y_"
        spectrum_labels = gsub(" = ", "y_", gsub(", y = ", "_", names(pixels(msidata))))

        ## One header row on top, then one row per m/z feature (label followed by its intensities)
        header_row = c("mz_name", spectrum_labels)
        intensity_table = rbind(header_row, cbind(feature_labels, spectra(msidata)))
        write.table(intensity_table, file="$intensity_matrix", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t")

        ## Release the large table before any further output section runs
        rm(intensity_table)
        gc()

    #end if


    ############################## m/z feature output ##########################
    #if "mz_tabular" in str($output_options).split(","):
        print("mz feature output")

        ## Feature labels: feature names with " = " replaced by "_" and every "/" removed
        mz_names = gsub(" = ", "_", names(features(msidata)))
        mz_names = gsub("/", "", mz_names)

        ## Mean, median and sd of the intensities per m/z (one value per row of the intensity matrix)
        full_sample_mean = rowMeans(spectra(msidata), na.rm=TRUE)
        full_sample_median = apply(spectra(msidata),1,median, na.rm=TRUE)
        full_sample_sd = apply(spectra(msidata),1,sd, na.rm=TRUE)
        ## NOTE(review): this is sd relative to the mean in percent, not sd/sqrt(n); it is exported as "sample_sem".
        ## The calculation is left unchanged because existing expected outputs presumably depend on it - confirm the intended statistic.
        full_sample_sem = full_sample_sd/full_sample_mean*100
        ## Sum of all intensities and number of intensities > 0 per m/z (maximum = number of spectra)
        mzTIC = rowSums(spectra(msidata), na.rm=TRUE)
        peakspermz = rowSums(spectra(msidata) > 0, na.rm=TRUE)

        ## Combine into dataframe, order is the same for all vectors
        mz_df = data.frame(mz_names, mz(msidata), full_sample_mean, full_sample_median, full_sample_sd, full_sample_sem, mzTIC, peakspermz)
        colnames(mz_df) = c("mz_names", "mz", "sample_mean", "sample_median", "sample_sd", "sample_sem", "intensity_sum", "number_peaks")
        write.table(mz_df, file="$feature_output", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
        ## Free up RAM in case further steps will be run
        rm(mz_df)
        gc()
    #end if

    ###################### summarized m/z feature output #######################

    #if str($tabular_annotation.load_annotation) == 'yes_annotation':
        print("summarized annotation output")

        ## Read the annotation file and keep only the x, y and annotation columns chosen by the user.
        ## annotation_input is reused by the pixel output section further down.
        input_tabular = read.delim("$tabular_annotation.annotation_file", header = $tabular_annotation.tabular_header, stringsAsFactors = FALSE)
        annotation_input = input_tabular[,c($tabular_annotation.column_x, $tabular_annotation.column_y, $tabular_annotation.column_names)]
        colnames(annotation_input) = c("x", "y", "annotation")

        ## Merge with the coordinates of msidata; pixel_index remembers the original pixel order
        ## because merge does not keep it
        msidata_coordinates = cbind(coord(msidata)[,1:2], seq_len(ncol(msidata)))
        colnames(msidata_coordinates)[3] = "pixel_index"
        merged_annotation = merge(msidata_coordinates, annotation_input, by=c("x", "y"), all.x=TRUE)
        ## Pixels without annotation form their own group named "NA"
        merged_annotation[is.na(merged_annotation)] = "NA"
        merged_annotation = merged_annotation[order(merged_annotation\$pixel_index),]
        msidata\$annotation = as.factor(merged_annotation[,4])

        ## Feature labels: feature names with " = " replaced by "_" and every "/" removed
        mz_names = gsub(" = ", "_", names(features(msidata)))
        mz_names = gsub("/", "", mz_names)

        ## Builds one summary table: a header row (header_label followed by the annotation groups) and
        ## one row per m/z with the value of summary_fun for each group.
        ## summary_fun receives the intensity matrix of one group and must return one value per m/z.
        summarize_by_annotation = function(summary_fun, header_label){
            annotation_groups = levels(msidata\$annotation)
            group_columns = lapply(annotation_groups, function(group){
                summary_fun(spectra(msidata[,msidata\$annotation == group]))
            })
            group_matrix = do.call(cbind, group_columns)
            rbind(c(header_label, annotation_groups), cbind(mz_names, group_matrix))
        }

        #if "mean" in str($tabular_annotation.summary_type).split(","):
            print("summarized mean")

            ## mean intensity per m/z and annotation group
            sample_matrix_mean = summarize_by_annotation(function(intensities) rowMeans(intensities, na.rm=TRUE), "mz_name")
            write.table(sample_matrix_mean, file="$summarized_mean", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t")
        #end if

        #if "median" in str($tabular_annotation.summary_type).split(","):
            print("summarized median")

            ## median intensity per m/z and annotation group
            ## NOTE(review): the header label "mz name" differs from the "mz_name" used for the mean table;
            ## kept as it is because existing expected outputs presumably depend on it
            sample_matrix_median = summarize_by_annotation(function(intensities) apply(intensities,1,median, na.rm=TRUE), "mz name")
            write.table(sample_matrix_median, file="$summarized_median", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t")
        #end if

        #if "sd" in str($tabular_annotation.summary_type).split(","):
            print("summarized sd")

            ## standard deviation of the intensities per m/z and annotation group
            ## NOTE(review): the header label "mz name" differs from the "mz_name" used for the mean table;
            ## kept as it is because existing expected outputs presumably depend on it
            sample_matrix_sd = summarize_by_annotation(function(intensities) apply(intensities,1,sd, na.rm=TRUE), "mz name")
            write.table(sample_matrix_sd, file="$summarized_sd", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t")
        #end if

    #end if


    ############################ spectra (pixel) output ############################
    #if "pixel_tabular" in str($output_options).split(","):
        print("pixel output")

        ## x and y coordinate of every spectrum
        xycoordinates = coord(msidata)[,1:2]

        ## spectrum name built from the coordinates: "xy_", x, "_", y
        pixel_names = paste0("xy_", xycoordinates\$x, "_", xycoordinates\$y)

        ## running index of the spectra in their original order
        pixelxyarray = seq_len(length(pixels(msidata)))

        ## number of peaks per spectrum: every intensity value > 0 counts as peak
        peaksperpixel = colSums(spectra(msidata)>0, na.rm=TRUE)

        ## summed intensity (TIC) per spectrum
        TICs = round(colSums(spectra(msidata), na.rm=TRUE), digits = 2)

        ## median ion intensity per spectrum
        med_int = round(apply(spectra(msidata), 2, median, na.rm=TRUE), digits = 2)

        ## maximum ion intensity per spectrum
        max_int = round(apply(spectra(msidata), 2, max, na.rm=TRUE), digits = 2)

        ## m/z with the highest intensity per spectrum
        ## NOTE(review): which.max returns nothing for a spectrum without any non-NA intensity, which would
        ## presumably break this lookup - confirm that such spectra cannot occur
        highestmz = apply(spectra(msidata),2,which.max)
        highestmz_data = mz(msidata)[highestmz]

        ## combine into dataframe; order is the same for all vectors
        spectra_df = data.frame(pixel_names, xycoordinates, pixelxyarray, peaksperpixel, med_int, TICs, max_int, highestmz_data)
        colnames(spectra_df) = c("spectra_names", "x_values", "y_values","pixel_order", "peaks_per_spectrum", "median_intensity", "spectrum_TIC", "maximum_intensity", "most_abundant_mz")

        #if str($counting_calibrants.pixel_with_calibrants) == "yes_calibrants":

            calibrant_list = read.delim("$counting_calibrants.mz_tabular", header = $counting_calibrants.feature_header, na.strings=c("","NA"), stringsAsFactors = FALSE)
            calibrant_list = calibrant_list[,$counting_calibrants.feature_column, drop=FALSE]
            ## keep only m/z values that are not missing and lie strictly inside the m/z range of the data;
            ## missing values (empty or NA entries in the file) would otherwise end up as NA m/z windows below
            calibrant_mz = calibrant_list[,1]
            valid_calibrants = !is.na(calibrant_mz) & calibrant_mz>min(mz(msidata)) & calibrant_mz<max(mz(msidata))
            inputcalibrants = calibrant_list[valid_calibrants,,drop = FALSE]
            inputcalibrantmasses = inputcalibrants[,1]

            ## matrix with one row per input m/z and one column per spectrum:
            ## TRUE if the spectrum has intensity > 0 inside the m/z window
            pixelmatrix = matrix(ncol=ncol(msidata), nrow = 0)

            if (length(inputcalibrantmasses) != 0){

                ## half width of the m/z window of each input m/z, derived from the ppm tolerance
                plusminusvalues = rep($counting_calibrants.plusminus_ppm/1000000, length(inputcalibrantmasses))*inputcalibrantmasses

                ## filter for the m/z window of each input m/z and check per spectrum if there is intensity > 0
                for (mass in seq_along(inputcalibrantmasses)){
                    filtered_data = msidata[mz(msidata) >= inputcalibrantmasses[mass]-plusminusvalues[mass] & mz(msidata) <= inputcalibrantmasses[mass]+plusminusvalues[mass],]
                    if (nrow(filtered_data) > 1 && sum(spectra(filtered_data),na.rm=TRUE) > 0){
                        ## several m/z features in the window: summed intensity per spectrum > 0
                        intensity_sum = colSums(spectra(filtered_data), na.rm=TRUE) > 0

                    }else if(nrow(filtered_data) == 1 && sum(spectra(filtered_data), na.rm=TRUE) > 0){
                        ## exactly one m/z feature in the window: its intensity per spectrum > 0
                        intensity_sum = spectra(filtered_data) > 0
                    }else{
                        ## no m/z feature or no intensity in the window: not found in any spectrum
                        intensity_sum = rep(FALSE, ncol(filtered_data))
                    }
                    ## add one row for this input m/z
                    pixelmatrix = rbind(pixelmatrix, intensity_sum)
                }
                ## for each spectrum count the input m/z whose window contains intensity > 0
                countvector= as.factor(apply(pixelmatrix, 2,sum,na.rm=TRUE))

            }else{
                ## no usable input m/z: every spectrum gets a count of 0
                countvector = rep(0,ncol(msidata))
            }

            ## add pixel coordinates to the counts and attach them to the spectra table
            countdf= cbind(coord(msidata)[,1:2], countvector)
            colnames(countdf) = c("x_values", "y_values", "m/z count")
            spectra_df = merge(spectra_df, countdf, by=c("x_values", "y_values"))

            ## sort columns to have spectra_names in the first column
            spectra_df = spectra_df[c("spectra_names", "x_values", "y_values","pixel_order", "peaks_per_spectrum", "median_intensity", "spectrum_TIC", "maximum_intensity", "most_abundant_mz", "m/z count")]

        #end if
        #if str($tabular_annotation.load_annotation) == 'yes_annotation':

            ## annotation_input was read in the summarized annotation section above
            colnames(annotation_input) = c("x_values", "y_values", "annotation")
            spectra_df = merge(spectra_df,annotation_input, by=c("x_values", "y_values"), all.x=TRUE)

            ## sort columns to have spectra_names in the first column
            #if str($counting_calibrants.pixel_with_calibrants) == "yes_calibrants":
                spectra_df = spectra_df[c("spectra_names", "x_values", "y_values","pixel_order", "peaks_per_spectrum", "median_intensity", "spectrum_TIC", "maximum_intensity", "most_abundant_mz", "m/z count", "annotation")]
            #else
                spectra_df = spectra_df[c("spectra_names", "x_values", "y_values","pixel_order", "peaks_per_spectrum", "median_intensity", "spectrum_TIC", "maximum_intensity", "most_abundant_mz", "annotation")]
            #end if

        #end if
        ## sort rows according to original pixel order (merge does not keep it)
        spectra_df = spectra_df[match(pixel_names, spectra_df\$spectra_names),]

        ## write the spectra table
        write.table(spectra_df, file="$pixel_output", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
    #end if


    ]]></configfile>
    </configfiles>
    <!-- Tool parameters: the MSI input (from a macro), the selection of tabular outputs, and two optional
         conditionals that read an m/z list and a pixel annotation file. -->
    <inputs>
        <expand macro="reading_msidata"/>
        <!-- Each selected value switches on the matching section of the R script and the matching output dataset -->
        <param name="output_options" type="select" display="checkboxes" optional="False" multiple="true" label="Multiple output files can be selected">
            <option value="int_matrix" selected="True" >intensity matrix</option>
            <option value="mz_tabular">mz feature output</option>
            <option value="pixel_tabular">pixel output</option>
        </param>
        <!-- Only read by the pixel output section of the script: adds the "m/z count" column -->
        <conditional name="counting_calibrants">
            <param name="pixel_with_calibrants" type="select" label="Use file with m/z of interest to calculate their occurrence in each spectrum">
                <option value="no_calibrants" selected="True">no</option>
                <option value="yes_calibrants">yes</option>
            </param>
            <when value="no_calibrants"/>
            <when value="yes_calibrants">
                <expand macro="reading_1_column_mz_tabular" label="For each spectrum the occurrence of the provided m/z values is counted"/>
                <param name="plusminus_ppm" value="200" type="float" label="ppm range will be added in both directions to input m/z" help="The m/z window is used to search for peaks, if intensity > 0 found in the window the m/z is considered present, if all intensities are 0 the m/z is considered not present"/>
            </when>
        </conditional>
        <!-- Switches on the summarized outputs (one table per selected summary_type) and, when the pixel
             output is selected, adds the "annotation" column to it -->
        <conditional name="tabular_annotation">
            <param name="load_annotation" type="select" label="Pixel annotation can be used to summarize intensities per annotation group">
                <option value="no_annotation" selected="True">no</option>
                <option value="yes_annotation">yes</option>
            </param>
            <when value="no_annotation"/>
            <when value="yes_annotation">
                <expand macro="reading_pixel_annotations"/>
                <param name="summary_type" type="select" display="checkboxes" optional="False" multiple="true" label="Calculation for each m/z and all pixels of a annotation group" help="This step will only work if pixel annotations are provided">
                    <option value="mean">mean</option>
                    <option value="median">median</option>
                    <option value="sd">standard deviation</option>
                </param>
            </when>
        </conditional>
    </inputs>
    <!-- All outputs are tabular and each one is only created when its filter is true -->
    <outputs>
        <!-- Outputs controlled by the output_options checkboxes -->
        <data format="tabular" name="intensity_matrix" label="${tool.name} on ${on_string}: intensity_matrix">
            <filter>"int_matrix" in output_options</filter>
        </data>
        <data format="tabular" name="pixel_output" label="${tool.name} on ${on_string}: spectra">
            <filter>"pixel_tabular" in output_options</filter>
        </data>
        <data format="tabular" name="feature_output" label="${tool.name} on ${on_string}: features">
            <filter>"mz_tabular" in output_options</filter>
        </data>
        <!-- Outputs controlled by the annotation conditional: one table per selected summary_type,
             independent of output_options -->
        <data format="tabular" name="summarized_mean" label="${tool.name} on ${on_string}: group_mean">
            <filter>tabular_annotation['load_annotation'] == 'yes_annotation' and 'mean' in tabular_annotation['summary_type']</filter>
        </data>
        <data format="tabular" name="summarized_median" label="${tool.name} on ${on_string}: group_median">
            <filter>tabular_annotation['load_annotation'] == 'yes_annotation' and 'median' in tabular_annotation['summary_type']</filter>
        </data>
        <data format="tabular" name="summarized_sd" label="${tool.name} on ${on_string}: group_sd">
            <filter>tabular_annotation['load_annotation'] == 'yes_annotation' and 'sd' in tabular_annotation['summary_type']</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test 1: intensity matrix and m/z feature output, no calibrants, no annotation -->
        <test expect_num_outputs="2">
            <expand macro="infile_imzml"/>
            <param name="output_options" value="int_matrix,mz_tabular"/>
            <output name="intensity_matrix" file="int_matrix1.tabular"/>
            <output name="feature_output" file="features_out1.tabular"/>
        </test>
        <!-- Test 2: pixel output with pixel annotation; the annotation also produces the mean and sd group tables -->
        <test expect_num_outputs="3">
            <expand macro="infile_analyze75"/>
            <param name="output_options" value="pixel_tabular"/>
            <conditional name="tabular_annotation">
                <param name="load_annotation" value="yes_annotation"/>
                <param name="annotation_file" value="annotations.tabular"/>
                <param name="column_x" value="1"/>
                <param name="column_y" value="2"/>
                <param name="column_names" value="4"/>
                <param name="tabular_header" value="True"/>
                <param name="summary_type" value="mean,sd"/>
            </conditional>
            <output name="pixel_output" file="pixel_out2.tabular"/>
            <output name="summarized_mean" file="mean_out2.tabular"/>
            <output name="summarized_sd" file="sd_out2.tabular"/>
        </test>
        <!-- Test 3: all three output options together with an m/z list that is counted per spectrum (200 ppm window) -->
        <test expect_num_outputs="3">
            <expand macro="infile_imzml"/>
            <param name="output_options" value="int_matrix,pixel_tabular,mz_tabular"/>
            <conditional name="counting_calibrants">
                <param name="pixel_with_calibrants" value="yes_calibrants"/>
                <param name="mz_tabular" value="inputcalibrantfile2.txt"/>
                <param name="feature_column" value="1"/>
                <param name="feature_header" value="False"/>
                <param name="plusminus_ppm" value="200"/>
            </conditional>
            <output name="intensity_matrix" file="int_matrix3.tabular"/>
            <output name="feature_output" file="features_out3.tabular"/>
            <output name="pixel_output" file="pixel_out3.tabular"/>
        </test>
    </tests>
    <help>
        <![CDATA[

@CARDINAL_DESCRIPTION@

-----

This tool provides multiple tabular output options for mass spectrometry imaging data files. 

@MSIDATA_INPUT_DESCRIPTION@

@SPECTRA_TABULAR_INPUT_DESCRIPTION@

@MZ_TABULAR_INPUT_DESCRIPTION@

**Output options**

- intensity matrix: m/z in rows, spectra in columns, filled with intensity values
- spectra output (option "pixel output"): spectra in rows - for each spectrum: name, x and y coordinates, order, number of peaks (intensities > 0), total ion chromatogram (TIC), median intensity, maximum intensity, m/z with the highest intensity per spectrum, optional count of m/z per spectrum, optional spectrum annotation
- mz feature output: m/z in rows - for each m/z: name, m/z, mean, median, standard deviation (sd), standard error of the mean (sem; currently calculated as sd / mean * 100), sum of all intensities per m/z, number of peaks (intensity > 0) per m/z
- summarized intensities: pixel annotations will be used to group spectra into annotation groups and calculate mean, median and sd of the intensities per group

        ]]>
    </help>
    <expand macro="citations"/>
</tool>