cardinal_classification: classification.xml comparison

comparison classification.xml @ 14:ece627528a78 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cardinal commit 888b3e991d0752b694bf480531ce0e5318c2f337-dirty"

author	galaxyp
date	Fri, 07 May 2021 10:10:35 +0000
parents	24c000517173
children	f28ad96b76dc

comparison

equal deleted inserted replaced

-:b18329a8ac14
+:ece627528a78
-<tool id="cardinal_classification" name="MSI classification" version="@VERSION@.0">
+<tool id="cardinal_classification" name="MSI classification" version="@VERSION@.1">
 <description>spatial classification of mass spectrometry imaging data</description>
 <macros>
 <import>macros.xml</import>
 </macros>
 <expand macro="requirements">
 ################################# load libraries and read file #########################
 library(Cardinal)
 library(gridExtra)
 library(ggplot2)
+library(scales)
 @READING_MSIDATA@
 msidata = as(msidata, "MSImageSet") ##coercion to MSImageSet
 title(main=paste0(Title," for file: \n\n", "$infile.display_name"))
 ##################### I) numbers and control plots #############################
-###############################################################################
+################################################################################
 ## table with values
 grid.table(property_df, rows= NULL)
 if (npeaks > 0 && sum(is.na(spectra(msidata)))==0){
 opar <- par()
-######################## II) Training #############################
+######################## II) Training #######################################
 #############################################################################
 #if str( $type_cond.type_method) == "training":
 print("training")
 msidata_coordinates = cbind(coord(msidata)[,1:2], c(1:ncol(msidata)))
 colnames(msidata_coordinates)[3] = "pixel_index"
 merged_response = merge(msidata_coordinates, y_input, by=c("x", "y"), all.x=TRUE)
 merged_response[is.na(merged_response)] = "NA"
 merged_response = merged_response[order(merged_response\$pixel_index),]
-y_vector = as.factor(merged_response[,4])
+conditions = as.factor(merged_response[,4])
+y_vector = conditions
 ## plot of y vector
-position_df = cbind(coord(msidata)[,1:2], y_vector)
+position_df = cbind(coord(msidata)[,1:2], conditions)
-y_plot = ggplot(position_df, aes(x=x, y=y, fill=y_vector))+
+y_plot = ggplot(position_df, aes(x=x, y=y, fill=conditions))+
 geom_tile() +
 coord_fixed()+
-ggtitle("Distribution of the response variable y")+
+ggtitle("Distribution of the conditions")+
-theme_bw()+
+		theme_bw()+
+theme(
+	       plot.background = element_blank(),
+	       panel.grid.major = element_blank(),
+	       panel.grid.minor = element_blank())+
 theme(text=element_text(family="ArialMT", face="bold", size=15))+
 theme(legend.position="bottom",legend.direction="vertical")+
 guides(fill=guide_legend(ncol=4,byrow=TRUE))
-coord_labels = aggregate(cbind(x,y)~y_vector, data=position_df, mean, na.rm=TRUE, na.action="na.pass")
+coord_labels = aggregate(cbind(x,y)~conditions, data=position_df, mean, na.rm=TRUE, na.action="na.pass")
-coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$y_vector)
+coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$conditions)
 print(y_plot)
 ## plot of folds
 position_df = cbind(coord(msidata)[,1:2], fold_vector)
 fold_plot = ggplot(position_df, aes(x=x, y=y, fill=fold_vector))+
 geom_tile() +
 coord_fixed()+
 ggtitle("Distribution of the fold variable")+
-theme_bw()+
+	       theme_bw()+
+theme(
+	       plot.background = element_blank(),
+	       panel.grid.major = element_blank(),
+	       panel.grid.minor = element_blank())+
 theme(text=element_text(family="ArialMT", face="bold", size=15))+
 theme(legend.position="bottom",legend.direction="vertical")+
 guides(fill=guide_legend(ncol=4,byrow=TRUE))
 coord_labels = aggregate(cbind(x,y)~fold_vector, data=position_df, mean, na.rm=TRUE, na.action="na.pass")
 coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$fold_vector)
 prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+
 geom_tile() +
 coord_fixed()+
 ggtitle("Predicted condition for each pixel")+
-theme_bw()+
+			theme_bw()+
+		        theme(
+		       plot.background = element_blank(),
+		       panel.grid.major = element_blank(),
+		       panel.grid.minor = element_blank())+
 theme(text=element_text(family="ArialMT", face="bold", size=15))+
 theme(legend.position="bottom",legend.direction="vertical")+
 guides(fill=guide_legend(ncol=4,byrow=TRUE))
 coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass")
 coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes)
 prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+
 geom_tile() +
 coord_fixed()+
 ggtitle("Predicted condition for each pixel")+
-theme_bw()+
+			theme_bw()+
+			theme(
+		       plot.background = element_blank(),
+		       panel.grid.major = element_blank(),
+		       panel.grid.minor = element_blank())+
 theme(text=element_text(family="ArialMT", face="bold", size=15))+
 theme(legend.position="bottom",legend.direction="vertical")+
 guides(fill=guide_legend(ncol=4,byrow=TRUE))
 coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass")
 coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes)
 s_value = as.numeric(substring(unlist(strsplit(best_params, ","))[3], 5)) ## remove space
 minimumy = min(coord(msidata.cv.ssc)[,2])
 maximumy = max(coord(msidata.cv.ssc)[,2])
 image(msidata.cv.ssc, model = list( r = r_value, s = s_value ), ylim= c(maximumy+0.2*maximumy,minimumy-0.2*minimumy),layout=c(1,1))
+		#if $type_cond.method_cond.ssc_analysis_cond.write_best_params:
+	write.table(r_value, file="$best_r", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t")
+	write.table(s_value, file="$best_s", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t")
+#end if
 ## print table with summary in pdf
 par(opar)
 plot(0,type='n',axes=FALSE,ann=FALSE)
 title(main="Summary for the different parameters\n", adj=0.5)
 ## 20 rows fits in one page:
 ## set variables for components and number of response groups
 number_groups = length(levels(y_vector))
 ## SSC analysis and plot
-msidata.ssc <- spatialShrunkenCentroids(msidata, y = y_vector, .fold = fold_vector,
+msidata.ssc <- spatialShrunkenCentroids(msidata, y = y_vector,
 r = c($type_cond.method_cond.ssc_r), s = c($type_cond.method_cond.ssc_s), method = "$type_cond.method_cond.ssc_kernel_method")
-plot(msidata.ssc, mode = "tstatistics", model = list("r" = c($type_cond.method_cond.ssc_r), "s" = c($type_cond.method_cond.ssc_s)))
+plot(msidata.ssc, mode = "tstatistics", model = list("r" = c($type_cond.method_cond.ssc_r), "s" = c($type_cond.method_cond.ssc_s)),
+			col=hue_pal()(length(levels(msidata.ssc\$classes[[1]]))), lwd=2)
 ### summary table SSC
 ##############summary_table = summary(msidata.ssc)
 ### stop if multiple values for r and s were used as input
 maximumy = max(coord(msidata)[,2])
 print(image(msidata, mz = topFeatures(msidata.ssc)[1,1], normalize.image = "linear", contrast.enhance = "histogram",smooth.image="gaussian", ylim= c(maximumy+0.2*maximumy,minimumy-0.2*minimumy), main="best m/z heatmap"))
 ## m/z and pixel information output
 ssc_classes = data.frame(msidata.ssc\$classes[[1]])
+ssc_probabilities = data.frame(msidata.ssc\$probabilities[[1]])
 ## pixel names and coordinates
 ## to remove potential sample names and z dimension, split at comma and take only x and y
 x_coords = unlist(lapply(strsplit(names(pixels(msidata)), ","), `[[`, 1))
 y_coords = unlist(lapply(strsplit(names(pixels(msidata)), ","), `[[`, 2))
 ## remove msidata to clean up RAM space
 rm(msidata)
 gc()
-ssc_classes2 = data.frame(pixel_names, x_coordinates, y_coordinates, ssc_classes)
+ssc_classes2 = data.frame(pixel_names, x_coordinates, y_coordinates, ssc_classes, ssc_probabilities)
-colnames(ssc_classes2) = c("pixel names", "x", "y","predicted condition")
+colnames(ssc_classes2) = c("pixel names", "x", "y","predicted condition", levels(msidata.ssc\$classes[[1]]))
 ssc_toplabels = topFeatures(msidata.ssc, n=Inf)
 ssc_toplabels[,6:9] <-round(ssc_toplabels[,6:9],6)
 write.table(ssc_toplabels, file="$mzfeatures", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
 write.table(ssc_classes2, file="$pixeloutput", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
-## image with predicted classes
+image(msidata.ssc, model=list(r = c($type_cond.method_cond.ssc_r), s = c($type_cond.method_cond.ssc_s)),
-prediction_df = cbind(coord(msidata.ssc)[,1:2], ssc_classes)
+			col=hue_pal()(length(levels(msidata.ssc\$classes[[1]]))), mode="classes", layout=c(1,1), main="Class Prediction")
-colnames(prediction_df) = c("x", "y", "predicted_classes")
+image(msidata.ssc, model=list(r = c($type_cond.method_cond.ssc_r), s = c($type_cond.method_cond.ssc_s)),
+			col=hue_pal()(length(levels(msidata.ssc\$classes[[1]]))), mode="probabilities", layout=c(1,1), main="Class probabilities")
-prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+
-geom_tile() +
-coord_fixed()+
-ggtitle("Predicted condition for each pixel")+
-theme_bw()+
-theme(text=element_text(family="ArialMT", face="bold", size=15))+
-theme(legend.position="bottom",legend.direction="vertical")+
-guides(fill=guide_legend(ncol=4,byrow=TRUE))
-coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass")
-coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes)
-print(prediction_plot)
 ## image with right and wrong classes:
+prediction_df = cbind(coord(msidata.ssc)[,1:2], ssc_classes)
+colnames(prediction_df) = c("x", "y", "predicted_classes")
 comparison_df = cbind(prediction_df, y_vector)
 comparison_df\$correct<- ifelse(comparison_df\$predicted_classes==comparison_df\$y_vector, T, F)
+	       correctness = round(sum(comparison_df\$correct)/length(comparison_df\$correct)*100,2)
 correctness_plot = ggplot(comparison_df, aes(x=x, y=y, fill=correct))+
 geom_tile() +
 coord_fixed()+
-ggtitle("Correctness of classification")+
+ggtitle(paste0("Correctness of classification: ",correctness, "%"))+
-theme_bw()+
+scale_fill_manual(values = c("TRUE" = "orange","FALSE" = "darkblue"))+
+		       theme_bw()+
+theme(
+		       plot.background = element_blank(),
+		       panel.grid.major = element_blank(),
+		       panel.grid.minor = element_blank())+
 theme(text=element_text(family="ArialMT", face="bold", size=15))+
 theme(legend.position="bottom",legend.direction="vertical")+
 guides(fill=guide_legend(ncol=2,byrow=TRUE))
 ## coord_labels = aggregate(cbind(x,y)~correct, data=comparison_df, mean, na.rm=TRUE, na.action="na.pass")
 ##coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes)
 merged_response = merge(msidata_coordinates, new_y_input, by=c("x", "y"), all.x=TRUE)
 merged_response[is.na(merged_response)] = "NA"
 merged_response = merged_response[order(merged_response\$pixel_index),]
 new_y_vector = as.factor(merged_response[,4])
 prediction = predict(training_data,msidata, newy = new_y_vector)
+	    ## Summary table prediction
+	    summary_table = summary(prediction)\$accuracy[[names(prediction@resultData)]]
+	    summary_table2 = round(as.numeric(summary_table), digits=2)
+	    summary_matrix = matrix(summary_table2, nrow=4, ncol=ncol(summary_table))
+	    summary_table3 = cbind(rownames(summary_table), summary_matrix) ## include rownames in table
+	    summary_table4 = t(summary_table3)
+	    summary_table5 = cbind(c(names(prediction@resultData),colnames(summary_table)), summary_table4)
+	    plot(0,type='n',axes=FALSE,ann=FALSE)
+	   grid.table(summary_table5, rows= NULL)
 #else
 prediction = predict(training_data,msidata)
 #end if
 predicted_classes2 = data.frame(pixel_names, x_coordinates, y_coordinates, predicted_classes)
 colnames(predicted_classes2) = c("pixel names", "x", "y","predicted condition")
 predicted_toplabels = topFeatures(prediction, n=Inf)
 if (colnames(predicted_toplabels)[4] == "coefficients"){
 predicted_toplabels[,4:6] <-round(predicted_toplabels[,4:6],5)
 }else{
 predicted_toplabels[,6:9] <-round(predicted_toplabels[,6:9],5)}
+##predicted classes
+prediction_df = cbind(coord(prediction)[,1:2], predicted_classes)
+colnames(prediction_df) = c("x", "y", "predicted_classes")
+#if str($type_cond.classification_type) == "SSC_classifier":
+## this seems to work only for SSC, therefore overwrite tables
+predicted_probabilities = data.frame(prediction\$probabilities[[1]])
+predicted_classes2 = data.frame(pixel_names, x_coordinates, y_coordinates, predicted_classes, predicted_probabilities)
+colnames(predicted_classes2) = c("pixel names", "x", "y","predicted condition", levels(prediction\$classes[[1]]))
+## also image modes are specific to SSC
+image(prediction, mode="classes", layout=c(1,1), main="Class", col=hue_pal()(length(unique(prediction\$classes[[1]]))))
+image(prediction, mode="probabilities", layout=c(1,1), main="Class probabilities", col=hue_pal()(length(unique(prediction\$classes[[1]]))))
+	#else
+prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+
+	geom_tile()+
+	coord_fixed()+
+	ggtitle("Predicted condition for each spectrum")+
+	theme_bw()+
+	theme(
+	plot.background = element_blank(),
+	panel.grid.major = element_blank(),
+	panel.grid.minor = element_blank())+
+	theme(text=element_text(family="ArialMT", face="bold", size=15))+
+	theme(legend.position="bottom", legend.direction="vertical")+
+	guides(fill=guide_legend(ncol=4, byrow=TRUE))
+	coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass")
+	coord_labels\$file_number = gsub( "_.*§", "", coord_labels\$predicted_classes)
+	print(prediction_plot)
+#end if
 write.table(predicted_toplabels, file="$mzfeatures", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
 write.table(predicted_classes2, file="$pixeloutput", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t")
-## image with predicted classes
-prediction_df = cbind(coord(prediction)[,1:2], predicted_classes)
-colnames(prediction_df) = c("x", "y", "predicted_classes")
-prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+
-geom_tile() +
-coord_fixed()+
-ggtitle("Predicted condition for each pixel")+
-theme_bw()+
-theme(text=element_text(family="ArialMT", face="bold", size=15))+
-theme(legend.position="bottom",legend.direction="vertical")+
-guides(fill=guide_legend(ncol=4,byrow=TRUE))
-coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass")
-coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes)
-print(prediction_plot)
+#if str($type_cond.new_y_values_cond.new_y_values) == "new_response":
 ## image with right and wrong classes:
 	comparison_df = cbind(prediction_df, new_y_vector)
-	comparison_df\$correct<- as.factor(ifelse(comparison_df\$predicted_classes==comparison_df\$new_y_vector, T, F))
+	comparison_df\$correct<- ifelse(comparison_df\$predicted_classes==comparison_df\$new_y_vector, T, F)
+correctness = round(sum(comparison_df\$correct)/length(comparison_df\$correct)*100,2)
 	correctness_plot = ggplot(comparison_df, aes(x=x, y=y, fill=correct))+
 geom_tile()+
 scale_fill_manual(values = c("TRUE" = "orange","FALSE" = "darkblue"))+
 coord_fixed()+
-ggtitle("Correctness of classification")+
+ggtitle(paste0("Correctness of classification: ",correctness, "%"))+
 theme_bw()+
 theme(text=element_text(family="ArialMT", face="bold", size=15))+
 theme(legend.position="bottom",legend.direction="vertical")+
 guides(fill=guide_legend(ncol=2,byrow=TRUE))
-## coord_labels = aggregate(cbind(x,y)~correct, data=comparison_df, mean, na.rm=TRUE, na.action="na.pass")
-##coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes)
 print(correctness_plot)
+#end if
-## Summary table prediction
-summary_table = summary(prediction)\$accuracy[[names(prediction@resultData)]]
-summary_table2 = round(as.numeric(summary_table), digits=2)
-summary_matrix = matrix(summary_table2, nrow=4, ncol=ncol(summary_table))
-summary_table3 = cbind(rownames(summary_table), summary_matrix) ## include rownames in table
-summary_table4 = t(summary_table3)
-summary_table5 = cbind(c(names(prediction@resultData),colnames(summary_table)), summary_table4)
-plot(0,type='n',axes=FALSE,ann=FALSE)
-grid.table(summary_table5, rows= NULL)
 ## optional output as .RData
 #if $output_rdata:
 msidata = prediction
 save(msidata, file="$classification_rdata")
 <conditional name="ssc_analysis_cond">
 <param name="ssc_method" type="select" label="Analysis step to perform">
 <option value="ssc_cvapply" selected="True">cvApply</option>
 <option value="ssc_analysis">spatial shrunken centroids analysis</option>
 </param>
-<when value="ssc_cvapply"/>
+<when value="ssc_cvapply">
+<param name="write_best_params" type="boolean" label="Write out best r and s values" help="Can be used to generate automatic classification workflow"/>
+</when>
 <when value="ssc_analysis">
 <!--param name="ssc_toplabels" type="integer" value="100"
 label="Number of toplabels (m/z features) which should be written in tabular output"/-->
 </when>
 </conditional>
 <expand macro="sanitizer_multiple_digits"/>
 </param>
 <param name="ssc_kernel_method" type="select" display="radio" label = "The method to use to calculate the spatial smoothing kernels for the embedding. The 'gaussian' method refers to spatially-aware (SA) weights, and 'adaptive' refers to spatially-aware structurally-adaptive (SASA) weights">
 <option value="gaussian">gaussian</option>
 <option value="adaptive" selected="True">adaptive</option>
 </param>
 </when>
 </conditional>
 </when>
 <when value="prediction">
 <param name="training_result" type="data" format="rdata" label="Result from previous classification training"/>
 <!--param name="predicted_toplabels" type="integer" value="100"
 label="Number of toplabels (m/z features) which should be written in tabular output"/-->
+<param name="classification_type" type="select" display="radio" optional="False" label="Which classification method was used">
+	<option value="PLS_classifier" selected="True" >PLS classifier</option>
+	<option value="OPLS_classifier">OPLS classifier</option>
+	<option value="SSC_classifier">SSC classifier</option>
+	</param>
 <conditional name="new_y_values_cond">
-<param name="new_y_values" type="select" label="Should new response values be used">
+<param name="new_y_values" type="select" label="Load annotations (optional, but allows accuracy calculations)">
-<option value="no_new_response" selected="True">old response should be used</option>
+<option value="no_new_response" selected="True">no</option>
-<option value="new_response">load new response from tabular file</option>
+<option value="new_response">use annotations</option>
 </param>
 <when value="no_new_response"/>
 <when value="new_response">
 <param name="new_response_file" type="data" format="tabular" label="Load tabular file with pixel coordinates and the new response"/>
 <param name="column_new_x" data_ref="new_response_file" label="Column with x values" type="data_column"/>
 </inputs>
 <outputs>
 <data format="pdf" name="classification_images" from_work_dir="classificationpdf.pdf" label = "${tool.name} on ${on_string}: results"/>
 <data format="tabular" name="mzfeatures" label="${tool.name} on ${on_string}: features"/>
 <data format="tabular" name="pixeloutput" label="${tool.name} on ${on_string}: pixels"/>
+<data format="txt" name="best_r" label="${tool.name} on ${on_string}: best r">
+<filter>type_cond['type_method'] == 'training' and type_cond['method_cond']['class_method'] == 'spatialShrunkenCentroids' and type_cond['method_cond']['ssc_analysis_cond']['ssc_method'] == 'ssc_cvapply' and type_cond['method_cond']['ssc_analysis_cond']['write_best_params']</filter>
+</data>
+<data format="txt" name="best_s" label="${tool.name} on ${on_string}: best s">
+<filter>type_cond['type_method'] == 'training' and type_cond['method_cond']['class_method'] == 'spatialShrunkenCentroids' and type_cond['method_cond']['ssc_analysis_cond']['ssc_method'] == 'ssc_cvapply' and type_cond['method_cond']['ssc_analysis_cond']['write_best_params']</filter>
+</data>
 <data format="rdata" name="classification_rdata" label="${tool.name} on ${on_string}: results.RData">
 <filter>output_rdata</filter>
 </data>
 </outputs>
 <tests>
 - O-PLS-DA: Orthogonal partial least squares discriminant analysis
 - Spatial shrunken centroids (more details in `Bemis et al. <https://doi.org/10.1074/mcp.O115.053918>`_)
 - training and prediction
 - training can be done with cvapply, which uses cross validation to find the best values for r and s; this requires not only a condition for each spectrum but also a fold (each fold should contain spectra of all conditions)
-- training with the best value for s gives the top m/z features for each condition and the predicted classification group for each spectrum
+- training with the best value for r and s gives the top m/z features for each condition and the predicted classification group for each spectrum
 - training result can be saved as RData file that can be reused for prediction of further samples
+- prediction can calculate accuracies when the annotations are known and provided
 .. image:: $PATH_TO_IMAGES/classification_overview.png
 :width: 1000
 :height: 465

Mercurial > repos > galaxyp > cardinal_classification

comparison classification.xml @ 14:ece627528a78 draft