| Next changeset 1:748b7a8b634c (2016-04-21) |
|
Commit message:
Uploaded |
|
added:
Frequency-COSMICv72-Hupki.txt R/compareSignature_Galaxy.r R/estimateSign_Galaxy.r R/mutationSpectra_Galaxy.r R/somaticSignature_Galaxy.r R/transciptionalStrandBias.r README.txt hg19_listAVDB.txt mm9_listAVDB.txt mutspecAnnot.pl mutspecAnnot.xml mutspecAnnot_wrapper.sh mutspecCompare.xml mutspecCompare_wrapper.sh mutspecFilter.pl mutspecFilter.xml mutspecNmf.xml mutspecNmf_wrapper.sh mutspecSplit.pl mutspecSplit.xml mutspecStat.pl mutspecStat.xml mutspecStat_wrapper.sh tool-data/annovar_index.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b Frequency-COSMICv72-Hupki.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Frequency-COSMICv72-Hupki.txt Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| b'@@ -0,0 +1,97 @@\n+Substitution Type\tTrinucleotide\tSomatic Mutation Type\tSignature 1\tSignature 2\tSignature 3\tSignature 4\tSignature 5\tSignature 6\tSignature 7\tSignature 8\tSignature 9\tSignature 10\tSignature 11\tSignature 12\tSignature 13\tSignature 14\tSignature 15\tSignature 16\tSignature 17\tSignature 18\tSignature 19\tSignature 20\tSignature 21\tSignature 22\tSignature 23\tSignature 24\tSignature 25\tSignature 26\tSignature 27\tSignature 28\tSignature 29\tSignature 30\tSignature 1 MEF\tSignature 2 MEF\tSignature 3 MEF\tSignature 5 MEF\n+C>A\tACA\tA[C>A]A\t0.0110983262\t0.0006827082\t0.0221723068\t0.0365\t0.0149415477\t0.0017\t0.0004\t0.0367180038\t0.012\t0.0007\t0.0002\t0.0077\t0.0003347572\t0.0001\t0.0013\t0.0161\t0.0018320192\t0.0505364186\t0.0107\t0.0011799616\t0.0001\t0.0015040704\t0.0004533607\t0.0286459925\t0.009896768\t0.0020397729\t0.0052056269\t0.0013974388\t0.0699819873\t0\t0.000781083\t0.0037229109\t0.0283533537\t0.0003710632\n+C>A\tACC\tA[C>A]C\t0.0091493407\t0.0006191072\t0.0178716754\t0.0309\t0.008960918\t0.0028\t0.0005\t0.0332457222\t0.0067\t0.001\t0.001\t0.0047\t0.0006487361\t0.0042\t0.004\t0.0097\t0.0003422356\t0.0109398248\t0.0074\t0.0022115051\t0.0007\t0.002451011\t0.0003668005\t0.0202146384\t0.0069989288\t0.0014871623\t0.0047382274\t0.0009171877\t0.0551523572\t0\t0.0022972224\t0.0070460466\t0.015676074\t0.001672691\n+C>A\tACG\tA[C>A]G\t0.0014900705\t0.000099279\t0.0021383396\t0.0183\t0.002207846\t0.0005\t0\t0.0025253113\t0.0005\t0.0003\t0\t0.0017\t3.8144594E-005\t0.0005\t0\t0.0022\t1.576225E-006\t0.0022880727\t0.0005\t1.61691E-007\t0\t0\t0\t0.0204789965\t0.001448443\t0.0002839456\t0.0007826979\t0\t0.017846984\t0.0019673\t0.0031701397\t0.0025537924\t0.0272331284\t0.0007812591\n+C>A\tACT\tA[C>A]T\t0.0062338852\t0.0003238914\t0.0162651456\t0.0243\t0.0092069053\t0.0019\t0.0004\t0.0335985495\t0.0068\t0.0092\t0.0002\t0.0046\t0.0008466585\t0.0296\t0.0057\t0.0088\t0.0031796648\t0.0194240914\t0.0074\t0.00300801\t0.0006\t0.0009224525\t0\t0.0246001454\t0.004966565\t0.0005978656\t0.0027182425\t0.00051341\t0.026804716\t0\t0.0015620621\t0.0104484061\t0.0079498813\t0.0004287024\n+C>A\tCCA\tC[C>A]A\t0.0065958701\t0.000677445\t0.0187817256\t0.0461\t0.0096749043\t0.0101\t0.0012\t0.0317237566\t0.0098\t0.0031\t0.0007\t0.0135\t0.0017100896\t0.0056\t0.0106\t0.0159\t0.0010324302\t0.0887681088\t0.0112\t0.0173771106\t0.002\t0.0045496929\t0.0001647394\t0.0635592838\t0.0148329479\t0.0037058501\t0.0050650733\t0.0011685156\t0.0514102117\t0\t0.0094052338\t0.0035222831\t0.0414403498\t0.0050521059\n+C>A\tCCC\tC[C>A]C\t0.0073423678\t0.000213681\t0.0157604578\t0.0614\t0.0049523006\t0.0241\t0.0006\t0.0255054071\t0.0057\t0.0009\t0.0017\t0.0112\t0.0011592566\t0.0102\t0.0084\t0.01\t0.0004218801\t0.0206413906\t0.0159\t0.036502463\t0.0014\t0.0037644739\t0.0007368748\t0.0337570047\t0.0078221753\t0.0039807234\t0.0022341533\t0.0003342918\t0.0258256508\t0\t0.0031118726\t0.0059886212\t0.0390237536\t0.0007556355\n+C>A\tCCG\tC[C>A]G\t0.0008928404\t6.77046E-006\t0.0019633898\t0.0088\t0.0028006273\t0.0091\t0\t0.0011596243\t0\t0.0007\t0.001\t0.0028\t0.0002441665\t0.0009\t0.0015\t0.0022\t0.0002974628\t0.0171784025\t0.0018\t0.0124825875\t0.0027\t0.0009001633\t0.0001639537\t0.0224289858\t0.0012769767\t0.000811742\t0.0002663122\t0.000053652\t0.0144961833\t0.0022624\t0.0031056722\t0.0027351313\t0.0444175322\t0.000426758\n+C>A\tCCT\tC[C>A]T\t0.0071865816\t0.0004163329\t0.0147228611\t0.0432\t0.0110134658\t0.0571\t0.0013\t0.028791173\t0.0091\t0.016\t0.0014\t0.0071\t0.0012567682\t0.1257\t0.0228\t0.0084\t3.1479429E-005\t0.0376769589\t0.0096\t0.1034012262\t0.0056\t0.0044398462\t0.0007227318\t0.0200865154\t0.0125636547\t0.0190384313\t0.00310057\t0.0001866719\t0.0403550741\t0\t0.0069708534\t0.0135089174\t0.0262144919\t0.0012449334\n+C>A\tGCA\tG[C>A]A\t0.008232604\t0.0003520134\t0.0096965397\t0.0376\t0.011892169\t0.0024\t0.0003\t0.0236823289\t0.0118\t0.0014\t0.0004\t0.0062\t0.0001321096\t0.0018\t0.0024\t0.0096\t0.0065354049\t0.1287241581\t0.0032\t0.0011161238\t0.0001\t0.0012983702\t0.0003499075\t0.0546764487\t0.0134652951\t0.0013753118\t0.0107558719\t0.0021366291\t0.0780466101\t0.008853\t0.0038823793\t0.0123398664\t0.0263977944\t0.0008119624\n+C>A\tGCC\tG[C>A]C\t0.0057580214\t0.0001338169\t0.0108433411\t0.0399\t0.0092478575\t0.0058\t0.0001\t0.0158218964\t0.0092\t0.0022\t0.001\t0.0056\t0.000754244\t0.0114\t0.0099\t0.0094\t0.001293804\t0.016'..b'0.0095898419\t0.0011168548\t0.0046233\t3.27595444415171E-020\t3.70860800590008E-020\t0.0005262194\t0.0004210208\n+T>G\tCTC\tC[T>G]C\t0.0020985024\t2.2095087E-005\t0.0058242955\t0.0013\t0.0034785021\t0.004\t0.0008\t0.0020962106\t0.0064\t0.0018\t0.0001\t0.0049\t0.0001476248\t0.002\t0.0019\t0.0082\t0.0198142936\t0.0020156969\t0.0011\t0.0084764945\t0.0005\t0.0010645836\t0.0001901443\t0.0021854373\t0.0028864543\t0.0039194551\t0.000381245\t0.0339132237\t0\t0.0060004\t0.0039803758\t0.0025376549\t0.0060513622\t2.28346914246428E-020\n+T>G\tCTG\tC[T>G]G\t0.0015995485\t0.0002282459\t0.0104646545\t0.0046\t0.0071474562\t0.005\t0.0009\t0.0066040463\t0.0126\t0.0037\t0.0011\t0.0045\t4.59873E-005\t0.0005\t0.0007\t0.0067\t0.0134498376\t0.0053119673\t0.0013\t0.0153837761\t0.0004\t0.0041885681\t0\t0.0001139097\t0.0117694925\t0.0054973573\t0.0004032572\t0.0185164816\t0.0021696933\t0.0073775\t0.0039200627\t0.0028967736\t0.007564699\t0.0003827374\n+T>G\tCTT\tC[T>G]T\t0.0027585376\t6.711134E-005\t0.0087243873\t0.0012\t0.0114868115\t0.0086\t0.0013\t0.0048667139\t0.0509\t0.0182\t0.0009\t0.0063\t0.0004637147\t0.0063\t0.0045\t0.0186\t0.2614566141\t0.0023416292\t0.0019\t0.0021853947\t0.0032\t0.0016294096\t0.000210835\t0.0025337183\t0.0085508075\t0.0067887187\t0.0014390961\t0.118967103\t0.0011436633\t0.0091481\t0.0157841091\t0.012870271\t0.0262293556\t0.0019670653\n+T>G\tGTA\tG[T>G]A\t0.000099045\t9.5552392E-005\t0.004144488\t0\t0.0016276645\t0\t0\t0.0009745787\t0.0072\t0\t0\t0\t1.8489857E-005\t0\t0\t0\t3.9193821E-005\t0.0013415325\t0\t3.579149E-006\t0\t0\t0\t0\t0.0055462269\t3.7393232E-005\t0\t0\t0\t0.0032461\t0.0015356557\t0.0019915953\t0.0015778632\t0.0008546281\n+T>G\tGTC\tG[T>G]C\t0.0002023656\t4.7002381E-005\t0.0045019853\t0\t0.0003277349\t0.0016\t0\t0.0005248216\t0.0006\t0.002\t0\t0.0004\t9.3373513E-005\t0.0019\t0.0022\t0.0032\t0.0090784615\t9.20431E-007\t0\t0.0028305751\t0.0006\t0\t0\t0.0010736323\t0\t0.002460705\t5.0790565E-005\t0\t0\t0.001869\t0.0007606153\t0.0041116159\t0.0032012845\t0.0008454968\n+T>G\tGTG\tG[T>G]G\t0.0011883532\t0.0001099257\t0.0163914526\t0.0018\t0.0059488798\t0.001\t0.0017\t0.0060877535\t0.005\t0.0009\t0.001\t0.0011\t1.0579194E-005\t0.0012\t0.0037\t0.0008\t0.0047827755\t0.009695\t0.0043\t0.0027332194\t0.0004\t0.0018775892\t0.0001871324\t0.002924697\t0.0086852444\t0.0008172016\t0.0047429186\t0.0042178083\t0.0036405938\t0.0033445\t0.0054079152\t3.70860800590008E-020\t0.002512037\t0.0012687784\n+T>G\tGTT\tG[T>G]T\t0.0008007233\t8.647718E-005\t0.0070672366\t0.0002\t0.0033074666\t0.0035\t0.0009\t0.0054274338\t0.0185\t0.003\t0\t0.0032\t5.3741207E-005\t0.0038\t0.019\t0.0044\t0.0634977566\t0.0028905535\t0.0025\t0.0035585586\t0.0008\t0\t0\t0.0008250983\t0.0027685717\t0.0078335613\t0.002298476\t0.0316489973\t0.0061810238\t0.0056069\t0.0101885638\t0.0066566288\t0.0079578565\t0.0029254388\n+T>G\tTTA\tT[T>G]A\t0.0013975537\t0.000071737\t0.0054271842\t0\t0.0052028744\t0.0009\t0\t0.0017432214\t0.0502\t0.005\t0\t0.0019\t0.0005465818\t0\t0\t0.0068\t0.0001334106\t3.6318404E-005\t0.0032\t2.69147E-007\t0\t0.0016516988\t0\t0\t0.0020814799\t8.534935E-006\t0.0011890225\t0.0093896587\t0\t0.0086563\t0.0008187477\t0.000948451\t0.0005293442\t2.28346914246428E-020\n+T>G\tTTC\tT[T>G]C\t0.001291737\t1.4281456E-005\t0.0061602504\t0.0003\t0.0051316079\t0.0019\t0.001\t0.0025498383\t0.0081\t0.0092\t0\t0.0027\t0.0002353471\t0.0015\t0.0004\t0.0069\t0.0096133452\t0.003233838\t0.0018\t0.0003772344\t0.0018\t0\t0\t0\t0.0005789788\t0.0027185098\t0.0002802954\t0.030117077\t0\t0.0043282\t0.0015336834\t0.0010831046\t0.0010211012\t2.28346914246428E-020\n+T>G\tTTG\tT[T>G]G\t0.0020310769\t0.0002066152\t0.0110765263\t0.003\t0.0060552541\t0.0011\t0.001\t0.0060303952\t0.0088\t0.0022\t0.0003\t0.0011\t0.000000479\t0.0002\t0.0009\t0.0049\t0.0045224623\t0.0007546018\t0.0011\t0.0005154216\t0.0003\t0.002572752\t0.0002475019\t0.0013605049\t0.0094291959\t0.0013691612\t0.0023530556\t0.0126987508\t0.000353696\t0.0082628\t0.0015835907\t3.70860800590008E-020\t0.003717572\t2.28346914246428E-020\n+T>G\tTTT\tT[T>G]T\t0.0040301282\t2.3598204E-005\t0.0130009842\t0.0011\t0.0133699358\t0.0072\t0.0014\t0.0072239989\t0.0545\t0.0633\t0.0003\t0.0032\t0.0006705883\t0.0025\t0.0033\t0.0163\t0.0580404078\t0.0021264415\t0.0013\t0.0006156567\t0.0003\t0\t0\t6.9515778E-005\t0.0078696716\t0.0025680767\t0.0001395613\t0.2336597833\t0.0061048341\t0\t0.0031734006\t0.001871395\t0.0032018465\t2.28346914246428E-020\n' |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b R/compareSignature_Galaxy.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/R/compareSignature_Galaxy.r Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| @@ -0,0 +1,125 @@ +#!/usr/bin/Rscript + +#-----------------------------------# +# Author: Maude # +# Script: compareSignature_Galaxy.r # +# Last update: 29/10/15 # +#-----------------------------------# + + +######################################################################################################################################### +# Compare new signatures with published one using the cosine similarity method # +######################################################################################################################################### + + +#------------------------------------------------------------------------------- +# Print a usage message if there is no argument pass to the command line +#------------------------------------------------------------------------------- +args <- commandArgs(TRUE) +usage <- function() +{ + msg <- paste0('Usage:\n', + ' compareSignature_Galaxy.r Published_Signature New_Signature Output_Folder\n' + ) + cat(msg, '\n', file="/dev/stderr") + quit(status=1) +} + +input = args[length(args)] + +if (length(args) == 0) { usage() } + + +#------------------------------------------------------------------------------- +# Load library +#------------------------------------------------------------------------------- +suppressMessages(suppressWarnings(library(lsa))) +suppressMessages(suppressWarnings(library(ggplot2))) +suppressMessages(suppressWarnings(library(reshape))) + +#------------------------------------------------------------------------------- +# Recover the arguments +#------------------------------------------------------------------------------- +published_signature_file <- args[1] # The matrix with the published signatures +unknown_signature_file <- args[2] # The matrix W from NMF from which we want to compare the signatures +dir <- args[3] # html directory + + +#------------------------------------------------------------------------------- +# Set the variables +#------------------------------------------------------------------------------- +# Create the outputs +output_cosineRes <- paste0(dir, "/Similarity_Matrix.txt") +output_png <- paste0(dir, "/Similarity_Matrix.png") + + +#------------------------------------------------------------------------------- +# Calculate the cosine similarity and represent it with a heatmap +#------------------------------------------------------------------------------- +# Published signatures +dataFrame1 <- read.table(published_signature_file, header=T, sep="\t") +# Remove the first three colmumns (Substitution Type, Trinucleotide Somatic, Mutation Type) +dataFrame1 <- dataFrame1[,4:ncol(dataFrame1)] +matrix1 <- as.matrix(dataFrame1) + +# Unkown signatures +dataFrame2 <- read.table(unknown_signature_file, header=T, sep="\t") +# Remove the first two columns (alteration, context) +dataFrame2 <- dataFrame2[,3:ncol(dataFrame2)] +matrix2 <- as.matrix(dataFrame2) +# Recover the number of new signatures +NbNewSignature <- ncol(dataFrame2) - 1 + +# Combined the two matrices (published and unknown signatures) +input_matrix_cos <- cbind(matrix1, matrix2) +# Calculate the cosine similarity +cosine_res <- cosine(input_matrix_cos) + +# Keep only the comparison between the two matrices +nbSign <- ncol(matrix1)+1 # +1 for havng the first signature of the matrix1 +cosine_res_subset <- cosine_res[nbSign:nrow(cosine_res), 1:ncol(matrix1)] + +# Save the matrix +write.table(cosine_res_subset, file=output_cosineRes, quote=F, sep="\t", col.names=T, row.names=T) + +# Transform the matrix in a suitable format for ggplot2 +cosineRes_subset_melt <- melt(cosine_res_subset) +# Rename the columns +colnames(cosineRes_subset_melt) <- c("Unknown_Signatures", "Published_Signatures", "Similarity") +# Reorder the Signature for having the same order as in the matrix. Turn your 'signature' column into a character vector +cosineRes_subset_melt$Published_Signatures <- as.character(cosineRes_subset_melt$Published_Signatures) +#Then turn it back into an ordered factor +cosineRes_subset_melt$Published_Signatures <- factor(cosineRes_subset_melt$Published_Signatures, levels=rev(unique(cosineRes_subset_melt$Published_Signature))) + +# Base plot: heatmap +p1 <- ggplot(cosineRes_subset_melt, aes(x=Published_Signatures, y=Unknown_Signatures, fill=Similarity)) + geom_tile(colour="yellow") +scale_fill_gradientn(colours=c("yellow", "red")) + theme_classic() + +# Rename the signatures +if(basename(published_signature_file) == "Frequency-COSMICv72-Hupki.txt") +{ + p1 <- p1 + scale_x_discrete(breaks = c("Signature.1", "Signature.2", "Signature.3", "Signature.4", "Signature.5", "Signature.6", "Signature.7", "Signature.8", "Signature.9", + "Signature.10", "Signature.11", "Signature.12", "Signature.13", "Signature.14", "Signature.15", "Signature.16", "Signature.17", + "Signature.18", "Signature.19", "Signature.20", "Signature.21", "Signature.22", "Signature.23", "Signature.24", "Signature.25", + "Signature.26", "Signature.27", "Signature.28", "Signature.29", "Signature.30", + "Signature.1.MEF", "Signature.2.MEF", "Signature.3.MEF", "Signature.5.MEF"), + labels = c("(Age) Sign 1", "(AID/APOBEC) Sign 2", "(BRCA1/2) Sign 3", "(Smoking) Sign 4", "Sign 5", "(DNA MMR deficiency) Sign 6", "(UV) Sign 7", + "Sign 8", "(IgG) Sign 9", "(pol e) Sign 10", "(temozolomide) Sign 11", "Sign 12", "(AID/APOBEC) Sign 13", "Sign 14", + "(DNA MMR deficiency) Sign 15", "Sign 16", "Sign 17", "Sign 18", "Sign 19", "(DNA MMR deficiency) Sign 20", "Sign 21", "(AA) Sign 22", + "Sign 23", "(Aflatoxin) Sign 24", "Sign 25", "(DNA MMR deficiency) Sign 26", "Sign 27", "Sign 28", "(Tobacco chewing) Sign 29", "Sign 30", + "(AA) Sign 1 MEF", "(AID) Sign 2 MEF", "(BaP) Sign 3 MEF", "(MNNG) Sign 5 MEF") + ) +} + +# Flipped cartesian coordinates so that horizontal becomes vertical, and vertical, horizontal +p1 <- p1 + coord_flip() +# Remove the x axis line +p1 <- p1 + theme(axis.line.x=element_blank(), axis.line.y=element_blank()) +# Add the cosine value only if >= 0.9 +cosResLabel <- subset(cosineRes_subset_melt, round(cosineRes_subset_melt$Similarity, digits=2) >= 0.9) # Subset the data for keeping only the values greater thant 0.9 +p1 <- p1 + geom_text(data = cosResLabel, aes(x = Published_Signatures, y = Unknown_Signatures, label = round(cosResLabel$Similarity, 2))) + +graphics.off() +options(bitmapType='cairo') +png(output_png, width=3000, height=2000, res=300) +plot(p1) +invisible( dev.off() ) |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b R/estimateSign_Galaxy.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/R/estimateSign_Galaxy.r Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| @@ -0,0 +1,105 @@ +#!/usr/bin/Rscript + +#-----------------------------------# +# Author: Maude # +# Script: estimateSign_Galaxy.r # +# Last update: 22/07/15 # +#-----------------------------------# + +######################################################################################################################################### +# Estimate the number of signatures for NMF # +######################################################################################################################################### + +#------------------------------------------------------------------------------- +# Load library for recovering the arguments +#------------------------------------------------------------------------------- +suppressMessages(suppressWarnings(require("getopt"))) + + +#------------------------------------------------------------------------------- +# Recover the arguments +#------------------------------------------------------------------------------- +spec = matrix(c( + "input" , "i", 1, "character", + "stop", "stop", 1, "numeric", + "cpu", "cpu", 1, "integer", + "output", "o", 1, "character", + "help", "h", 0, "logical" + ), + byrow=TRUE, ncol=4 + ) + +opt = getopt(spec); + +# No argument is pass to the command line +if(length(opt) == 1) +{ + cat(paste("Usage:\n estimateSign_Galaxy.r --input <matrix> --stop <maxNbSign> --cpu <cpu> --output <output_filename.png>\n",sep="")) + q(status=1) +} + +# Help was asked for. +if ( !is.null(opt$help) ) +{ + # print a friendly message and exit with a non-zero error code + cat(paste("Usage:\n estimateSign_Galaxy.r --input <matrix> --stop <maxNbSign> --cpu <cpu> --output <output_filename.png>\n",sep="")) + q(status=1) +} + + + +#------------------------------------------------------------------------------- +# Load library +#------------------------------------------------------------------------------- +suppressMessages(suppressWarnings(library(NMF))) + + + ############################################################################### + # Load the functions # + ############################################################################### + +#------------------------------------------------------------------------------- +# Check the file doesn't have lines equal to zero +#------------------------------------------------------------------------------- +CheckFile <- function(rowsum, dataFrame, x) +{ + if(rowsum == 0) + { + write("\n\nERROR: There is not enough mutations for running NMF!!!", stderr()) + write(paste0("Input matrix contains at least one null row ", rownames(dataFrame)[x], "\n\n"), stderr()) + stop() + } +} + + + ############################################################################### + # Check file # + ############################################################################### + +# The input musn't contains lines equal to zero !!! +matrixNMF <- read.table(opt$input, header=T) +# suppresses the return of sapply function +invisible( sapply(1:nrow(matrixNMF), function(x) { CheckFile(rowSums(matrixNMF)[x], matrixNMF, x) } ) ) + + + + ############################################################################### + # Estimate the number of signatures # + ############################################################################### +# Estimate the number of signatures with our data +nbCPU <- paste0("vP", opt$cpu) +nbSign <- 2:opt$stop # The minum number of signatures can't be lower than 2 + +estim_r <- nmf(matrixNMF, method="brunet", nbSign, nrun=50, .opt=nbCPU) + +# Shuffle original data +v_random <- randomize(matrixNMF) +# Estimate quality measures from the shuffled data +estim_r_random <- nmf(v_random, method="brunet", nbSign, nrun=50, .opt=nbCPU) + +# Plot the estimation for our data and the random ones +graphics.off() +options(bitmapType='cairo') +png(opt$output, width=3000, height=2000, res=300) +plot(estim_r, estim_r_random) +invisible( dev.off() ) |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b R/mutationSpectra_Galaxy.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/R/mutationSpectra_Galaxy.r Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| b'@@ -0,0 +1,203 @@\n+#!/usr/bin/Rscript\n+\n+#-----------------------------------#\n+# Author: Maude #\n+# Script: mutationSpectra_Galaxy.r #\n+# Last update: 23/07/15 #\n+#-----------------------------------#\n+\n+#########################################################################################################################################\n+# Represent the mutation spectra with a bar graph #\n+#########################################################################################################################################\n+\n+#-------------------------------------------------------------------------------\n+# Print a usage message if there is no argument pass to the command line\n+#-------------------------------------------------------------------------------\n+args <- commandArgs(TRUE)\n+usage <- function() \n+{\n+ msg <- paste0(\'Usage:\\n\',\n+ \' mutationSpectra_Galaxy.r input_Mutation_Spectra Sample_Name Output_Folder_High_Resolution Output_Folder_Low_Resolution Count_ca Count_cg Count_ta Count_tc Count_tg\\n\',\n+ \'\\ninput_Mutation_Spectra should be tab-separated: alteration context value\\n\',\n+ \'\\nOutput_Folder_High_Resolution: Folder for saving the high resolution image (display on the HTML page)\\n\',\n+ \'\\nOutput_Folder_Low_Resolution: Folder for saving the low resolution image (display on the Excel report)\\n\'\n+ )\n+ cat(msg, \'\\n\', file="/dev/stderr")\n+ quit(status=1)\n+}\n+\n+input = args[length(args)]\n+\n+if (length(args) == 0) { usage() }\n+\n+\n+#-------------------------------------------------------------------------------\n+# Load library\n+#-------------------------------------------------------------------------------\n+suppressMessages(suppressWarnings(library(ggplot2)))\n+suppressMessages(suppressWarnings(library(reshape)))\n+suppressMessages(suppressWarnings(library(grid)))\n+suppressMessages(suppressWarnings(library(scales)))\n+suppressMessages(suppressWarnings(library(gridExtra)))\n+\n+\n+\n+#-------------------------------------------------------------------------------\n+# Recover the arguments\n+#-------------------------------------------------------------------------------\n+input <- args[1]\n+sampleName <- args[2]\n+output_html <- args[3]\n+output_report <- args[4]\n+count_ca <- as.numeric(args[5])\n+count_cg <- as.numeric(args[6])\n+count_ct <- as.numeric(args[7])\n+count_ta <- as.numeric(args[8])\n+count_tc <- as.numeric(args[9])\n+count_tg <- as.numeric(args[10])\n+\n+count_ca <- paste("C>A (", count_ca, ")")\n+count_cg <- paste("C>G (", count_cg, ")")\n+count_ct <- paste("C>T (", count_ct, ")")\n+count_ta <- paste("T>A (", count_ta, ")")\n+count_tc <- paste("T>C (", count_tc, ")")\n+count_tg <- paste("T>G (", count_tg, ")")\n+\n+\n+\n+ ###############################################################################\n+ # Load the functions #\n+ ###############################################################################\n+\n+#-------------------------------------------------------------------------------\n+# Set the font depending on X11 availability\n+#-------------------------------------------------------------------------------\n+font <- ""\n+# Check the device available\n+device <- capabilities()\n+# X11 is available\n+if(device[5]) { font <- "Helvetica" } else { font <- "mono" }\n+\n+#-------------------------------------------------------------------------------\n+# My own thme\n+#-------------------------------------------------------------------------------\n+theme_custom <- function(base_size = 12, base_'..b'<- ggplot(matrixW_melt, aes(x=context, y=value, fill=alteration)) + geom_bar(stat="identity", width=0.5) + facet_grid(variable ~ alteration, scales="free_y")\n+# Color the mutation like Alexandrov et al.\n+p <- p + scale_fill_manual(values=c("blue", "black", "red", "#828282", "#00CC33", "pink"))\n+# Remove the legend\n+p <- p + guides(fill=FALSE)\n+# customized theme (no background, no facet grid and strip, y axis only)\n+p <- p + mytheme\n+# Remove the title of the x facet strip\n+p <- p + theme(strip.text.x=element_blank(), strip.text.y=element_blank())\n+# Remove the x axis label, thicks and title\n+p <- p + theme(axis.title.x=element_blank(), axis.ticks.x=element_blank(), axis.title.y=element_text(size=15))\n+# Scale the y axis to the maximum value\n+p <- p + scale_y_continuous(limits=c(0,max_matrixW), oob=squish, breaks=c(0,max_matrixW), labels=fmt())\n+# Rename the y axis\n+p <- p + ylab("percent")\n+# Add a title to the plot\n+p <- p + ggtitle(sampleName) + theme(plot.title = element_text(vjust = 3.4, family=font))\n+# Add a top margin for writing the title of the plot\n+p <- p + theme(plot.margin=unit(c(.7,0,0,0), "cm"))\n+p <- p + scale_x_discrete(breaks = c("A_A","A_C","A_G","A_T", "C_A","C_C","C_G","C_T", "G_A","G_C","G_G","G_T", "T_A","T_C","T_G","T_T"),\n+\t\t labels =c(\'A\\nA\',"\\nC","\\nG","\\nT", \'C\\nA\',"\\nC","\\nG","\\nT",\n+\t\t\t \t \'G\\nA\',"\\nC","\\nG","\\nT", \'T\\nA\',"\\nC","\\nG","\\nT"\n+ )\n+ )\n+\n+#------------------------------------------------------------------------------------------------------------------------------\n+# Change the color of the facets for the mutation type\n+#------------------------------------------------------------------------------------------------------------------------------\n+cols <- rep( c("blue", "black", "red", "#828282", "#00CC33", "pink")) # Facet strip colours\n+\n+# Make a grob object\n+Pg <- ggplotGrob(p)\n+# To keep track of strip.background grobs\n+idx <- 0\n+# Find each strip.background and alter its backround colour\n+for( g in 1:length(Pg$grobs) )\n+{\n+\tif( grepl( "strip.absoluteGrob" , Pg$grobs[[g]]$name ) )\n+\t{\n+\t\tidx <- idx + 1\n+\t\tsb <- which( grepl( "strip\\\\.background" , names( Pg$grobs[[g]]$children ) ) )\n+\t\tPg$grobs[[g]]$children[[sb]][]$gp$fill <- cols[idx]\n+\t}\n+}\n+\n+# Reduce the size of the facet strip\n+Pg$heights[[3]] = unit(.1,"cm")\n+\n+\n+# Save the plot for the HTML page (higher resolution)\n+graphics.off() # close graphics windows\n+# Use cairo device as isn\'t possible to install X11 on the server...\n+png(paste0(output_html, "/", sampleName, "-MutationSpectraPercent-Genomic.png"), width=3500, heigh=500, res=300, type=c("cairo-png"))\n+plot(Pg)\n+## Add label for the mutation type above the strip facet\n+grid.text(0.13, unit(0.90,"npc") - unit(1,"line"), label=count_ca)\n+grid.text(0.29, unit(0.90,"npc") - unit(1,"line"), label=count_cg)\n+grid.text(0.45, unit(0.90,"npc") - unit(1,"line"), label=count_ct)\n+grid.text(0.6, unit(0.90,"npc") - unit(1,"line"), label=count_ta)\n+grid.text(0.76, unit(0.90,"npc") - unit(1,"line"), label=count_tc)\n+grid.text(0.92, unit(0.90,"npc") - unit(1,"line"), label=count_tg)\n+invisible( dev.off() )\n+\n+# Save the plot for the report\n+png(paste0(output_report, "/", sampleName, "-MutationSpectraPercent-Genomic-Report.png"), width=1000, heigh=150, type=c("cairo-png"))\n+plot(Pg)\n+## Add label for the mutation type above the strip facet\n+grid.text(0.13, unit(0.90,"npc") - unit(1,"line"), label=count_ca)\n+grid.text(0.29, unit(0.90,"npc") - unit(1,"line"), label=count_cg)\n+grid.text(0.45, unit(0.90,"npc") - unit(1,"line"), label=count_ct)\n+grid.text(0.6, unit(0.90,"npc") - unit(1,"line"), label=count_ta)\n+grid.text(0.76, unit(0.90,"npc") - unit(1,"line"), label=count_tc)\n+grid.text(0.92, unit(0.90,"npc") - unit(1,"line"), label=count_tg)\n+invisible( dev.off() )\n+\n+# Delete the empty plot created by the script\n+if (file.exists("Rplots.pdf")) invisible( file.remove("Rplots.pdf") )\n' |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b R/somaticSignature_Galaxy.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/R/somaticSignature_Galaxy.r Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| b'@@ -0,0 +1,467 @@\n+#!/usr/bin/Rscript\n+\n+#-----------------------------------#\n+# Author: Maude #\n+# Script: somaticSignature_Galaxy.r #\n+# Last update: 29/07/15 #\n+#-----------------------------------#\n+\n+\n+#########################################################################################################################################\n+# Run NMF algorithm and represent the composition of somatic signatures and the contribution in each samples #\n+#########################################################################################################################################\n+\n+#-------------------------------------------------------------------------------\n+# Load library for recovering the arguments\n+#-------------------------------------------------------------------------------\n+suppressMessages(suppressWarnings(require("getopt")))\n+\n+\n+#-------------------------------------------------------------------------------\n+# Recover the arguments\n+#-------------------------------------------------------------------------------\n+spec = matrix(c(\n+ "input" , "i", 1, "character",\n+ "nbSignature", "nbSign", 1, "integer",\n+ "cpu", "cpu", 1, "integer",\n+ "output", "o", 1, "character",\n+ "help", "h", 0, "logical"\n+ ),\n+ byrow=TRUE, ncol=4\n+ )\n+\n+opt = getopt(spec);\n+\n+# No argument is pass to the command line\n+if(length(opt) == 1)\n+{\n+ cat(paste("Usage:\\n somaticSignature_Galaxy.r --input <matrix> --nbSignature <nbSign> --cpu <cpu> --output <outputdir>\\n",sep=""))\n+ q(status=1)\n+}\n+\n+# Help was asked for.\n+if ( !is.null(opt$help) )\n+{\n+ # print a friendly message and exit with a non-zero error code\n+ cat(paste("Usage:\\n somaticSignature_Galaxy.r --input <matrix> --nbSignature <nbSign> --cpu <cpu> --output <outputdir>\\n",sep=""))\n+ q(status=1)\n+}\n+\n+\n+\n+#-------------------------------------------------------------------------------\n+# Load library\n+#-------------------------------------------------------------------------------\n+suppressMessages(suppressWarnings(library(NMF)))\n+suppressMessages(suppressWarnings(library(ggplot2)))\n+suppressMessages(suppressWarnings(library(reshape)))\n+suppressMessages(suppressWarnings(library(grid)))\n+suppressMessages(suppressWarnings(library(scales))) # Set the maximum value to the y axis (graph composition somatic signature)\n+suppressMessages(suppressWarnings(library(gridExtra))) # function "unit"\n+\n+\n+\n+ ###############################################################################\n+ # Load the functions #\n+ ###############################################################################\n+\n+#-------------------------------------------------------------------------------\n+# Set the font depending on X11 availability\n+#-------------------------------------------------------------------------------\n+font <- ""\n+# Check the device available\n+device <- capabilities()\n+# X11 is available\n+if(device[5]) { font <- "Helvetica" } else { font <- "Helvetica-Narrow" }\n+\n+#-------------------------------------------------------------------------------\n+# My own theme\n+#-------------------------------------------------------------------------------\n+theme_custom <- function(base_size = 4, base_family = "")\n+{\n+ # Starts with theme_grey and then modify some parts\n+ theme_grey(base_size = base_size, base_family = base_family) %+replace%\n+ theme(\n+ axis.text = element_text(size = rel(0.8), family=font),\n+ axis.ticks = element_line(colour = "black", size=.2),\n+ axis.line = element_line(colour = "black", size = .2),\n+ axis.ticks.length= unit(.05, "cm"),\n+ axis.tic'..b' number of SBS\n+matrixH_norm_melt$ContriSBS <- sapply(1:nrow(matrixH_norm_melt), function(x) { Contri2SignSBS(matrixH_norm_melt$Total_SBS[x], matrixH_norm_melt$Value[x]) } )\n+\n+\n+# Save the matrix\n+write.table(matrixH_norm_melt, file=output_matrixH_ggplot2, quote=F, sep="\\t", col.names=T, row.names=F)\n+\n+# Base plot for the contribution of each samples according the count of mutations\n+p2 <- ggplot(matrixH_norm_melt, aes(x=reorder(Sample, -ContriSBS), y=ContriSBS, fill=Signature)) + geom_bar(stat="identity") + theme_classic()\n+# Remove the name of samples\n+p2 <- p2 + theme(axis.text.x = element_blank()) \n+# Reverse the y axis\n+p2 <- p2 + scale_y_reverse()\n+# Rename the y and x axis\n+p2 <- p2 + ylab("Number of mutations") + xlab("Samples")\n+# Remove the x axis line\n+p2 <- p2 + theme(axis.line.x=element_blank())\n+\n+# Base plot for the contribution of each samples in percentages\n+p3 <- ggplot(matrixH_norm_melt, aes(x=reorder(Sample, -ContriSBS), y=Value, fill=Signature)) + geom_bar(stat="identity") + theme_classic() + theme(axis.text.x = element_blank()) + xlab("") + ylab("% of mutations")\n+# Remove the x axis line\n+p3 <- p3 + theme(axis.line.x=element_blank(), axis.ticks.x=element_blank())\n+\n+\n+# Plot PNG\n+png(figure_matrixH_png, width=3000, heigh=2000, res=300)\n+# Combined the two plots for the contribution of the samples\n+suppressWarnings( grid_arrange_shared_legend(p3, p2) )\n+invisible( dev.off() )\n+\n+\n+ ###############################################################################\n+ # Average contributions of each signature in each cluster #\n+ ###############################################################################\n+\n+matrixH_cluster <- cbind(matrix_cluster[,1], t(matrixH_norm))\n+colnames(matrixH_cluster) <- c("Cluster", colnames(t(matrixH_norm)))\n+\n+df <- as.data.frame(matrixH_cluster)\n+\n+tmp_mat <- sapply(1:opt$nbSignature, function(x) { meanCluster(df[df[,1] == x,]) } )\n+# Add a name for the row and the col\n+rownames(tmp_mat) <- sapply(1:opt$nbSignature, function(x) { paste0("Sig. ", x) } )\n+colnames(tmp_mat) <- sapply(1:opt$nbSignature, function(x) { paste0("Cluster ", x) } )\n+tmp_mat <- t(tmp_mat)\n+# Recover the number of samples in each cluster\n+nbSampleByCluster <- sapply(1:opt$nbSignature, function(x) { as.numeric( strsplit( as.character(dim(df[df[,1] == x,])), " " ) ) } )\n+# Combined the average contribution and the number of samples\n+tmp_mat <- cbind(tmp_mat, nbSampleByCluster[1,])\n+# Add a name for the row and the col\n+colnames(tmp_mat)[opt$nbSignature+1] <- "Number of samples"\n+# Save the matrix\n+write.table(tmp_mat, file=output_matrixH_cluster, quote=F, sep="\\t", col.names=T, row.names=T)\n+\n+## Create an image of the table with ggplot2\n+# Dummy plot\n+p4 <- qplot(1:10, 1:10, geom = "blank") + \n+ theme(panel.grid.major = element_blank(),\n+ panel.grid.minor = element_blank(),\n+ panel.border = element_rect(fill=NA,color="white", size=0.5, linetype="solid"),\n+ axis.line = element_blank(),\n+ axis.ticks = element_blank(),\n+ panel.background = element_rect(fill="white"),\n+ plot.background = element_rect(fill="white"),\n+ legend.position = "none", \n+ axis.text = element_blank(),\n+ axis.title = element_blank()\n+ )\n+# Adding a table\n+p4 <- p4 + annotation_custom(grob = tableGrob(tmp_mat),\n+ xmin = 4, xmax = 7,\n+ ymin = 0, ymax = 10)\n+\n+# Save the table\n+png(figure_matrixH_cluster, width=2500, heigh=1000, res=300)\n+# Combined the two plots for the contribution of the samples\n+plot(p4)\n+invisible( dev.off() )\n+\n+\n+# Delete the empty plot created by the script\n+if (file.exists("Rplots.pdf")) invisible( file.remove("Rplots.pdf") )\n' |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b R/transciptionalStrandBias.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/R/transciptionalStrandBias.r Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| b'@@ -0,0 +1,144 @@\n+#!/usr/bin/Rscript\n+\n+#---------------------------------------------#\n+# Author: Maude #\n+# Script: transcriptionalStrandBias_Galaxy.r #\n+# Last update: 03/07/15 #\n+#---------------------------------------------#\n+\n+#########################################################################################################################################\n+# Transcriptional strand bias #\n+#########################################################################################################################################\n+\n+#-------------------------------------------------------------------------------\n+# Print a usage message if there is no argument pass to the command line\n+#-------------------------------------------------------------------------------\n+args <- commandArgs(TRUE)\n+usage <- function() \n+{\n+ msg <- paste0(\'Usage:\\n\',\n+ \' transcriptionalStrandBias_Galaxy.r input Output_Folder_High_Resolution Output_Folder_Low_Resolution Label_Y_axis\\n\',\n+ \'\\ninput should be tab-separated: MutationTypeContext Strand Value Sample\\n\',\n+ \'\\nOutput_Folder_High_Resolution: Folder for saving the high resolution image (display on the HTML page)\\n\',\n+ \'\\nOutput_Folder_Low_Resolution: Folder for saving the low resolution image (display on the Excel report)\\n\',\n+ \'\\nLabel_Y_axis: can be Count or Percent\'\n+ )\n+ cat(msg, \'\\n\', file="/dev/stderr")\n+ quit(status=1)\n+}\n+\n+input = args[length(args)]\n+\n+if (length(args) == 0) { usage() }\n+\n+\n+\n+#-------------------------------------------------------------------------------\n+# Load library\n+#-------------------------------------------------------------------------------\n+suppressMessages(suppressWarnings(library(ggplot2)))\n+suppressMessages(suppressWarnings(library(gridExtra)))\n+\n+\n+#-------------------------------------------------------------------------------\n+# Recover the argument pass in the command line\n+#-------------------------------------------------------------------------------\n+input <- args[1]\n+output <- args[2]\n+output_temp <- args[3] # Temp folder for the plot present in the Excel report\n+legend_y_axis <- args[4]\n+\n+\n+#-------------------------------------------------------------------------------\n+# Create the plot\n+#-------------------------------------------------------------------------------\n+## Load the data\n+txnSB <- read.table(input, header=T)\n+## Define the color for the transcribed (blue) and non-transcribed strand(red)\n+cb_palette_SB <- c("#0072B2", "#CC0000")\n+## Reorder the mutation on the x axis (same order as NMF)\n+txnSB$MutationTypeContext <- factor(txnSB$MutationTypeContext,\n+ levels=c(\n+ "C>A:A_A","C>A:A_C","C>A:A_G","C>A:A_T","C>A:C_A","C>A:C_C","C>A:C_G","C>A:C_T","C>A:G_A","C>A:G_C","C>A:G_G","C>A:G_T","C>A:T_A","C>A:T_C","C>A:T_G","C>A:T_T",\n+ "C>G:A_A","C>G:A_C","C>G:A_G","C>G:A_T","C>G:C_A","C>G:C_C","C>G:C_G","C>G:C_T","C>G:G_A","C>G:G_C","C>G:G_G","C>G:G_T","C>G:T_A","C>G:T_C","C>G:T_G","C>G:T_T",\n+ "C>T:A_A","C>T:A_C","C>T:A_G","C>T:A_T","C>T:C_A","C>T:C_C","C>T:C_G","C>T:C_T","C>T:G_A","C>T:G_C","C>T:G_G","C>T:G_T","C>T:T_A","C>T:T_C","C>T:T_G","C>T:T_T",\n+ "T>A:A_A","T>A:A_C","T>A:A_G","T>A:A_T","T>A:C_A","T>A:C_C","T>A:C_G","T>A:C_T","T>A:G_A","T>A:G_C","T>A:G_G","T>A:G_T","T>A:T_A","T>A:T_C","T>A:T_G","T>A:T_T",\n+ "T>C:A_A","T>C:A_C","T>C:A_G","T>C:A_T","T>C:C_A","T>C:C_C","T>C:C_G","T>C:C_T","T>C:G_A","T>C:G_C","T>C:G_G","T>C:G_T","T>C:T_A","T>C:T_C","T>C:T_G","T>C:T_T",\n+ "T>G:A_A","T>G:A_C","T>G:A_G","T>G:A_T","T>G:C_A","T>G:C_C","T>G:C_G","T>G:C_T","T>G:G_A","T>G:G_C","T>G:G_G","T>G:G_T","T>G:T_A","T>G:T_C","T>G:T_G","T>G:T_T"\n+ \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t)\n+\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'..b'"G>A:C_G","G>A:C_T","G>A:G_A","G>A:G_C","G>A:G_G","G>A:G_T","G>A:T_A","G>A:T_C","G>A:T_G","G>A:T_T",\n+ "G>C:A_A","G>C:A_C","G>C:A_G","G>C:A_T","G>C:C_A","G>C:C_C","G>C:C_G","G>C:C_T","G>C:G_A","G>C:G_C","G>C:G_G","G>C:G_T","G>C:T_A","G>C:T_C","G>C:T_G","G>C:T_T",\n+ "G>T:A_A","G>T:A_C","G>T:A_G","G>T:A_T","G>T:C_A","G>T:C_C","G>T:C_G","G>T:C_T","G>T:G_A","G>T:G_C","G>T:G_G","G>T:G_T","G>T:T_A","G>T:T_C","G>T:T_G","G>T:T_T",\n+ "T>A:A_A","T>A:A_C","T>A:A_G","T>A:A_T","T>A:C_A","T>A:C_C","T>A:C_G","T>A:C_T","T>A:G_A","T>A:G_C","T>A:G_G","T>A:G_T","T>A:T_A","T>A:T_C","T>A:T_G","T>A:T_T",\n+ "T>C:A_A","T>C:A_C","T>C:A_G","T>C:A_T","T>C:C_A","T>C:C_C","T>C:C_G","T>C:C_T","T>C:G_A","T>C:G_C","T>C:G_G","T>C:G_T","T>C:T_A","T>C:T_C","T>C:T_G","T>C:T_T",\n+ "T>G:A_A","T>G:A_C","T>G:A_G","T>G:A_T","T>G:C_A","T>G:C_C","T>G:C_G","T>G:C_T","T>G:G_A","T>G:G_C","T>G:G_G","T>G:G_T","T>G:T_A","T>G:T_C","T>G:T_G","T>G:T_T"\n+ \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n+ \t\t\t\t\t\t\t\t\t\tlabels=c(\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T",\n+ "A_A","A_C","A_G","A_T","C_A","C_C","C_G","C_T","G_A","G_C","G_G","G_T","T_A","T_C","T_G","T_T"\n+ \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t)\n+ \t\t\t\t\t\t\t\t\t )\n+## Changing the appearance of x axis thicks\n+p_txnSB <- p_txnSB + theme(axis.text.x = element_text(angle=60, hjust=1, vjust=1))\n+## Close graphics windows\n+graphics.off()\n+## Save the plot for the HTML page (higher resolution)\n+options(bitmapType=\'cairo\') # # Use cairo device as isn\'t possible to install X11 on the server...\n+png(paste0(output, ".png"), width=4000, height=1000, res=300)\n+plot(p_txnSB)\n+# Add a label bellow the bar graph for indicating the mutation type\n+grid.text(paste("C>A", sep=""), x=unit(.14, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))\n+grid.text(paste("C>G", sep=""), x=unit(.29, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))\n+grid.text(paste("C>T", sep=""), x=unit(.45, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))\n+grid.text(paste("T>A", sep=""), x=unit(.58, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))\n+grid.text(paste("T>C", sep=""), x=unit(.74, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))\n+grid.text(paste("T>G", sep=""), x=unit(.9, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))\n+invisible( dev.off() )\n+\n+\n+\n+# Save the plot for the report\n+p_txnSB\n+ggsave(paste0(output_temp, "-Report.png"), width=18)\n+\n+# Delete the empty plot created by the script\n+if (file.exists("Rplots.pdf")) invisible( file.remove("Rplots.pdf") )\n' |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| @@ -0,0 +1,76 @@ +============================== + MutSpec-Suite +============================== + +Created by Maude Ardin and Vincent Cahais (Mechanisms of Carcinogenesis Section, International Agency for Research on Cancer F69372 Lyon France, http://www.iarc.fr/) + +Version 1.0 + +Released under GNU public license version 2 (GPL v2) + +Package description: Ardin et al. - 2016 - MutSpec: a Galaxy toolbox for streamlined analyses of somatic mutation spectra in human and mouse cancer genomes - BMC Bioinformatics + +Test data: https://usegalaxy.org/u/maude-ardin/p/mutspectestdata + + +### Requirements + + # python-dev +build-essential and python-dev packages must be installed on your machine before installing MutSpec tools: +$ sudo apt-get install build-essential python-dev + + + # Annovar +If you do not have ANNOVAR installed, you can download it here: http://www.openbioinformatics.org/annovar/annovar_download_form.php + +1) Once downloaded, install annovar per the installation instructions and edit the PATH variable in galaxy deamon (/etc/init.d/galaxy) to reflect the location of directory containing perl scripts. + +2) Create directories for saving Annovar databases + 2-a Create a folder (annovardb) for saving all Annovar databases, e.g. hg19db + 2-b Create a subfolder (seqFolder) for saving the reference genome, e.g. hg19db/hg19_seq + +3) Download the reference genome (by chromosome) from UCSC for all desired builds as follows: +$ annotate_variation.pl -buildver <build> -downdb seq <seqFolder> + +where <build> can be hg18, hg19 or hg38 for the human genome or mm9, mm10 for the mouse genome. +and <seqFolder> is the location where the sequences (by chromosme) should be stored, e.g. hg19db/hg19_seq + + +4) Download all desired databases for all desired builds as follows: +$ annotate_variation.pl -buildver <build> [-webfrom annovar] -downdb <database> <annovardb> + +/!\ At least the database refGene must be downloaded /!\ + +where <build> can be hg18, hg19 or hg38 for the human genome or mm9, mm10 for the mouse genome. +and <database> is the database file to download, e.g. refGene +and <annovardb> is the location where all database files should be stored, e.g. hg19db + +The list of all available databases can be found here: http://annovar.openbioinformatics.org/en/latest/user-guide/download/ + + +5) Edit the annovar_index.loc file (in the folder galaxy-dist/tool-data/toolshed/repos/iarc/mutspec/revision/) to reflect the location of annovardb folder (containing all the databases files downloaded from Annovar). +Restart galaxy instance for changes in .loc file to take effect or reload it into the admin interface. + +6) Edit the file build_listAVDB.txt in the mutspec install directory to reflect the name and the type of the databases installed + + +### Installation + + # MutSpec-Stat and MutSpec-NMF +By default 1 CPU is used by these tools, but you may edit mutspecStat_wrapper.sh and mutspecNmf_wrapper.sh to change this number to the maximum number of CPU available on your server. + +MutSpec-Stat and MutSpec-NMF tools allow parallel computations that are time consuming. +It is recommended to use the highest number of cores available on the Galaxy server to reduce the computation time of these tools. + + + + # MutSpec-Annot +The maximum CPU value needs to be specified when installing MutSpec package by editing the file mutspecAnnot.pl to reflect the maximum number of CPU available on your server (by default 1 CPU is used). + +This tool may be time consuming for large files. For example, annotating a file with more than 25,000 variants takes 1 hour using 1 CPU (2.6 GHz), while annotating this file using 8 CPUs takes only 5 minutes. +We have optimized MutSpec-Annot so that the tool uses more CPUs, if available, as follows: +-files with less than 5,000 lines: 1 CPU is used +-files with more than 5,000 and less than 25,000 lines: 2 CPUs are used +-files with more than 25,000 and less than 100,000 lines: 8 (or maximum CPUs, if less than 8 CPUs are available) are used (our benchmark results didn't show any time saving using more than 8 cores for files with more than 25,000 +but less than 100,000 lines) +-files with more than 100,000: maximum CPUs are used |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b hg19_listAVDB.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hg19_listAVDB.txt Tue Apr 19 03:07:11 2016 -0400 |
| b |
| @@ -0,0 +1,37 @@ +#This is a sample file distributed with Galaxy that is used by the +#MutSpec-Annot tools. The hg19_listAVDB.txt has this format (white space +#characters are TAB characters): +# +#<RefGenome_DatabaseName> <operation> +# +# +# +#hg19_refGene.txt g +#hg19_genomicSuperDups.txt r +#hg19_snp138NonFlagged.txt f +hg19_refGene.txt g +hg19_knownGene.txt g +hg19_ensGene.txt g +hg19_cytoBand.txt r +hg19_gwasCatalog.txt r +hg19_genomicSuperDups.txt r +hg19_snp138.txt f +hg19_snp138NonFlagged.txt f +hg19_ALL.sites.2015_08.txt f +hg19_AFR.sites.2015_08.txt f +hg19_AMR.sites.2015_08.txt f +hg19_EAS.sites.2015_08.txt f +hg19_EUR.sites.2015_08.txt f +hg19_SAS.sites.2015_08.txt f +hg19_esp6500siv2_all.txt f +hg19_esp6500siv2_aa.txt f +hg19_esp6500siv2_ea.txt f +hg19_ljb26_sift.txt f +hg19_ljb26_pp2hdiv.txt f +hg19_ljb26_pp2hvar.txt f +hg19_cosmic70.txt f +hg19_exac03.txt f +hg19_exac03nontcga.txt f +hg19_exac03nonpsych.txt f +hg19_kaviar20150923.txt f +hg19_hrcr1.txt f |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mm9_listAVDB.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mm9_listAVDB.txt Tue Apr 19 03:07:11 2016 -0400 |
| b |
| @@ -0,0 +1,17 @@ +#This is a sample file distributed with Galaxy that is used by the +#MutSpec-Annot tools. The mm9_listAVDB.txt has this format (white space +#characters are TAB characters): +# +#<RefGenome_DatabaseName> <operation> +# +# +# +#mm9_refGene.txt g +#mm9_genomicSuperDups.txt r +#mm9_snp128.txt f +mm9_refGene.txt g +mm9_knownGene.txt g +mm9_ensGene.txt g +mm9_cytoBand.txt r +mm9_genomicSuperDups.txt r +mm9_snp128.txt f |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecAnnot.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecAnnot.pl Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| b'@@ -0,0 +1,1209 @@\n+#!/usr/bin/env perl\n+\n+#-----------------------------------#\n+# Author: Maude #\n+# Script: mutspecAnnot.pl #\n+# Last update: 17/02/16 #\n+#-----------------------------------#\n+\n+use strict;\n+use warnings;\n+use Getopt::Long;\n+use Pod::Usage;\n+use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\\.[^.]*/);\n+use File::Path;\n+use Parallel::ForkManager;\n+\n+\n+our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.\n+our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files\n+our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context.\n+our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines)\n+\n+GetOptions(\'verbose|v\'=>\\$verbose, \'help|h\'=>\\$help, \'man|m\'=>\\$man, \'refGenome=s\'=>\\$refGenome, \'interval=i\' => \\$intervalEnd, \'fullAnnotation=s\' => \\$fullAVDB, \'outfile|o=s\' => \\$output, \'pathAnnovarDB|AVDB=s\' => \\$path_AVDB, \'pathAVDBList=s\' => \\$pathAVDBList, \'pathTemporary|temp=s\' => \\$folder_temp) or pod2usage(2);\n+\n+our ($input) = @ARGV;\n+\n+pod2usage(-verbose=>1, -exitval=>1, -output=>\\*STDERR) if ($help);\n+pod2usage(-verbose=>2, -exitval=>1, -output=>\\*STDERR) if ($man);\n+pod2usage(-verbose=>0, -exitval=>1, -output=>\\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script\n+pod2usage(-verbose=>0, -exitval=>1, -output=>\\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)\n+\n+\n+\n+######################################################################################################################################################\n+#\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tGLOBAL VARIABLES\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t #\n+######################################################################################################################################################\n+\n+#########################################\n+### SPECIFY THE NUMBER OF CPU ###\n+#########################################\n+our $max_cpu = 1; # Max number of CPU to use for the annotation\n+\n+\n+# Recover the current path\n+our $pwd = `pwd`;\n+chomp($pwd);\n+\n+# Input file path\n+our @pathInput = split("/", $input);\n+# Output directories\n+our ($folderMutAnalysis, $folderAnnovar) = ("", "");\n+# File with the list of Annovar databases to use\n+our $listAVDB = "";\n+# Initialisation of chromosome, position, ref and alt values\n+our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a");\n+\n+\n+######################################################################################################################################################\n+#\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tMAIN \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t #\n+######################################################################################################################################################\n+## Check the presence of the flags and create the output and temp directories\n+CheckFlags();\n+\n+## Format the file in the correct format if they are vcf or MuTect output and recover the column positions\n+FormatingInputFile();\n+\n+# Annotate the file with Annovar, add the strand orientation and the sequence context\n+FullAnnotation();\n+\n+######################################################################################################################################################\n+#\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tFUNCTIONS\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t #\n+########################################################'..b'_cat_mt_results." > $output";\n+\t`$cmd_cat_mt_results`;\n+}\n+\n+\n+sub recoverNumCol\n+{\n+\tmy ($input, $name_of_column) = @_;\n+\n+ open(F1,$input) or die "recoverNumCol: $!: $input\\n";\n+ # For having the name of the columns\n+ my $search_header = <F1>; $search_header =~ s/[\\r\\n]+$//; my @tab_search_header = split("\\t",$search_header);\n+ close F1;\n+ # The number of the column\n+ my $name_of_column_NB = "toto";\n+ for(my $i=0; $i<=$#tab_search_header; $i++)\n+ {\n+ if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; }\n+ }\n+ if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn\'t exits in the input file $input!!!!!\\n"; exit; }\n+ else { return $name_of_column_NB; }\n+}\n+\n+\n+\n+\n+=head1 NAME\n+\n+mutspec-Annot\n+\n+=head1 SYNOPSIS\n+\n+\tmutspecannot.pl [arguments] <query-file>\n+\n+ <query-file> can be a folder with multiple VCF or a single VCF\n+\n+ Arguments:\n+ -h, --help print help message\n+ -m, --man print complete documentation\n+ -v, --verbose use verbose output\n+ --refGenome the reference genome to use\n+ --interval <interger> the number of bases for the sequence context\n+ -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file\n+ -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size\n+ --pathAVDBList the path to the list of AV databases installed\n+ -temp --pathTemporary <string> the path for saving the temporary files\n+ --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no)\n+\n+\n+Function: automatically run a pipeline on a list of variants and annote them using Annovar\n+\n+ Example: # Annotation only\n+ mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input\n+\n+\n+ Version: 02-2016 (Feb 2016)\n+\n+\n+=head1 OPTIONS\n+\n+=over 8\n+\n+=item B<--help>\n+\n+print a brief usage message and detailed explanation of options.\n+\n+=item B<--man>\n+\n+print the complete manual of the program.\n+\n+=item B<--verbose>\n+\n+use verbose output.\n+\n+\n+=item B<--refGenome>\n+\n+the reference genome to use, could be hg19 or mm9.\n+\n+=item B<--interval>\n+\n+the number of bases surrounding the mutated bases, for the sequence context analysis.\n+\n+=item B<--outfile>\n+\n+the directory of output file names. If it is nor specify the same directory as the input file is used.\n+\n+=item B<--pathAnnovarDB>\n+\n+the path to the directory containing the Annovar databases and the files with the chromosome size.\n+\n+=item B<--pathAVDBList>\n+\n+the path to a texte file containing the list of the Annovar databases installed.\n+\n+=item B<--pathTemporary>\n+\n+the path for saving temporary files generated by the script.\n+If any is specify a temporary folder is created in the same directory where the script is running.\n+Deleted when the script is finish\n+\n+=item B<--fullAnnotation>\n+\n+Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines)\n+\n+=head1 DESCRIPTION\n+\n+MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS.\n+Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added.\n+A text tab delimited file is produced.\n+\n+=cut\n' |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecAnnot.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecAnnot.xml Tue Apr 19 03:07:11 2016 -0400 |
| b |
| b'@@ -0,0 +1,172 @@\n+<tool id="mutSpecannot" name="MutSpec Annot" version="0.1" hidden="false">\n+<description>Annotate variants with ANNOVAR and other databases</description>\n+\n+<requirements>\n+ <requirement type="set_environment">SCRIPT_PATH</requirement>\n+ <requirement type="package" version="5.18.1">perl</requirement>\n+</requirements>\n+\n+<command interpreter="bash">\n+ mutspecAnnot_wrapper.sh\n+ $output\n+ --refGenome ${refGenome}\n+ --AVDB ${refGenome.fields.path}\n+ --interval $interval\n+ --fullAnnotation ${annotation_type}\n+ $input\n+</command>\n+\n+<inputs>\n+\t<param name="input" type="data" format="txt" label="Input file" help="Select a single file, multiple files or a dataset collection"/>\n+\t\n+\t<param name="refGenome" type="select" label="Reference genome" help="Select the reference genome that was used for generating your data">\n+ <options from_data_table="annovar_index" />\n+ </param>\n+ \n+\t<param name="interval" type="text" value="10" label="Sequence context of variants" help="Number of retrieved bases that flank variants in 5\' and 3\'"/>\n+\n+ <param name="annotation_type" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Complete annotations" help="Select No if you have a file with millions of variants and you are just interested in having a quick overview of the mutational spectrum. Only the annotation from refGene, the strand orientation and the sequence context will be added." />\n+\n+</inputs>\n+\n+<outputs>\n+\t<data name="output" type="data" format="tabular" label="${input.name} annotated" />\n+</outputs> \n+\n+\n+<stdio>\n+ <regex match="ANNOVAR LOG FILE"\n+ source="stdout"\n+ level="fatal"\n+ description="Read Annovar log file for more information" />\n+</stdio>\n+\n+<help>\n+\n+**What it does**\n+\n+MutSpect-Annot provides functional annotations from `ANNOVAR software`__ (June 2015 version is provided here), as well as the strand transcript orientation (from refGene database) and sequence context of variants (extrated from the reference genome selected).\n+\n+.. __: http://www.openbioinformatics.org/annovar/\n+\n+--------------------------------------------------------------------------------------------------------------------------------------------------\n+\n+**Input formats**\n+\n+MutSpect-Annot accepts files in VCF (version 4.1) or in tab-delimited (TAB) format.\n+\n+.. class:: infomark\n+\n+TIP: If your data is not TAB delimited, use *Text manipulation -> convert*\n+\n+.. class:: warningmark\n+\n+Filenames must be <= 31 characters.\n+\n+.. class:: warningmark\n+\n+These files should contain at least four columns describing for each variant, the chromosome number, the start genomic position, the reference allele and the alternate allele\n+\n+.. class:: warningmark \n+\n+The tool supports different column names (**names are case-sensitive**) depending on the source file as follows:\n+\n+**mutect** : contig position ref_allele alt_allele\n+\n+**vcf** : CHROM POS REF ALT\n+\n+**cosmic** : Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic\n+\n+**icgc** : chromosome chromosome_start reference_genome_allele mutated_to_allele\n+\n+**tcga** : Chromosome Start_position Reference_Allele Tumor_Seq_Allele2\n+\n+**ionTorrent** : chr Position Ref Alt \n+\n+**proton** : Chrom Position Ref Variant \n+\n+**varScan2** : Chrom Position Ref VarAllele\n+\n+**annovar** : Chr Start Ref Obs \n+\n+**custom** : Chromosome Start Wild_Type Mutant\n+\n+.. class:: infomark\n+\n+For MuTect output files, only confident calls are considered (variants containing the string REJECT in the judgement column are not annotated and excluded from the MutSpect-Annot output) as other calls are very likely to be dubious calls or artefacts.\n+\n+.. class:: infomark\n+\n+For COSMIC and ICGC files, variants are reported on several transcripts. These duplicate variants need'..b"annotations are retrieved:\n+\n+**ANNOVAR annotations**\n+\n+An example of annotations retrieved by the tool.\n+\n+Gene-based: RefSeqGene, UCSC Known Gene and Ensembl Gene\n+\n+Region-based: localization of the variant on cytogenetic band (cytoBand), variant reported in Genome-Wide association studies (gwasCatalog) and variant mapped to segmental duplications (genomicSuperDups)\n+\n+Filter-based: \n+\n+ - dbSNP: For human genome there is two versions available: the defaul version (snp) and a pre-filtered version (snpNonFlagged). In the pre-filtered version all SNPs ‹ 1% minor allele frequency (MAF) (or unknown), mapping only once to reference assembly, or flagged in dbSnp as clinically associated are removed from the full dbSNP database and therefore not present in this version.\n+\n+ - 1000 Genomes Project (ALL, AFR (African), AMR (Admixed American), EAS (East Asian), EUR (European), SAS (South Asian))\n+\n+ - ESP: Exome Sequencing Project (ALL, AA (African American), EA (European American))\n+\n+ - ExAC: Exome Aggregation Consortium (ALL, AFR (African), AMR (Admixed American), EAS (East Asian), FIN (Finnish), NFE (Non-finnish European), OTH (other), SAS (South Asian))\n+\n+ - LJB26: SIFT, PolyPhen-2 (HDIV and HVAR)\n+\n+**Transcript orientation**\n+\n+The strand annotation corresponding to transcript orientation within genic regions is recovered from RefSeqGene database.\n+\n+**Sequence context**\n+\n+Flanking bases in both sides in 5' and 3' of the variant position retrieved from the reference genome used.\n+\n+--------------------------------------------------------------------------------------------------------------------------------------------------\n+\n+**Example**\n+\n+Annotate the following file::\n+\n+ Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele2\n+ chr7 121717919 121717920 - G\n+ chr1 230846235 230846235 T A\n+ chr14 33290999 33290999 A G\n+ chr12 8082458 8082458 C T\n+ chr4 70156391 70156391 T C\n+\n+Will produce::\n+\n+ Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups snp138 1000g2014oct_all esp6500si_all Strand context Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele2\n+ chr7 121717919 121717920 - G exonic AASS frameshift insertion AASS:NM_005763:exon23:c.2634dupC:p.A879fs NA rs147476318 NA NA - GCG chr7 121717919 121717920 - G\n+ chr1 230846235 230846235 T A exonic AGT nonsynonymous SNV AGT:NM_000029:exon2:c.A362T:p.H121L NA NA NA NA - GTG chr1 230846235 230846235 T A\n+ chr14 33290999 33290999 A G exonic AKAP6 nonsynonymous SNV AKAP6:NM_004274:exon13:c.A3980G:p.D1327G NA NA NA NA + GAC chr14 33290999 33290999 A G\n+ chr12 8082458 8082458 C T exonic SLC2A3 nonsynonymous SNV SLC2A3:NM_006931:exon6:c.G683A:p.R228Q NA rs200481428 0.000199681 NA - CCG chr12 8082458 8082458 C T\n+ chr4 70156391 70156391 T C exonic UGT2B28 nonsynonymous SNV UGT2B28:NM_053039:exon5:c.T1172C:p.V391A score=0.949699;Name=chr4:70035680 NA 0.000199681 NA + GTA chr4 70156391 70156391 T C\n+\n+\n+</help>\n+\n+</tool>\n" |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecAnnot_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecAnnot_wrapper.sh Tue Apr 19 03:07:11 2016 -0400 |
| b |
| @@ -0,0 +1,26 @@ +#!/bin/bash + +output=$1;shift +refg=$2 +input=${9} + +command -v table_annovar.pl >/dev/null 2>&1 || { + echo "ERROR : table_annovar.pl not found. Add annovar scripts to your galaxy path !" ; + return 1 ; +} + +mkdir out +name=${input##*/} +name=${name%%.*} + +perl $SCRIPT_PATH/mutspecAnnot.pl \ + --outfile out \ + --pathAVDBList $SCRIPT_PATH \ + --temp "./temp" \ + $* 2>&1 + +ls out/Mutational_Analysis/Annovar/ +cp out/Mutational_Analysis/Annovar/${name}.${refg}_multianno.txt $output + +exit 0 + |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecCompare.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecCompare.xml Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| @@ -0,0 +1,109 @@ +<?xml version="1.0"?> +<tool id="mutSpeccompare" name="MutSpec Compare" version="0.0.1"> +<description>Compare signatures with the cosine similarity method</description> + +<requirements> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <requirement type="package" version="3.1.2">R</requirement> + <requirement type="package" version="0.1">mutspec</requirement> +</requirements> + +<command interpreter="bash"> + mutspecCompare_wrapper.sh + $newsign + $output + #if $refSignatureSource.source == "fromtable": + \$SCRIPT_PATH/Frequency-COSMICv72-Hupki.txt + #else + ${refSignatureSource.h_publish} + #end if +</command> + +<inputs> + <conditional name="refSignatureSource"> + <param name="source" type="select" label="Reference signatures" help="You may select the provided file that includes published signatures (see details further below) or your own reference file"> + <option value="fromtable">Use COSMICv72_Hupki2014</option> + <option value="history">Use one from my history</option> + </param> + <when value="fromtable"> + <options from_data_table="published_signature_matrice" /> + </when> + <when value="history"> + <param name="h_publish" type="data" format="tabular" label="Select a file from my history" help="Matrix correctly formated (see details further below)"/> + </when> + </conditional> + + <param name="newsign" type="data" format="html" label="Newly identified signature" help="Select an output of the tool MutSpec-NMF"/> + +</inputs> + +<outputs> + <data name="output" format="html" label="Similarity_Matrix on dataset ${newsign.name}" /> +</outputs> + + +<help> + +**What it does** + +Compare two matrices containing published and newly identified mutation signatures using the `cosine similarity`__ method as already used by `Alexandrov et al. 2013`__, `Olivier et al. 2014`__ or `Schulze et al. 2015`__ + +.. __: http://en.wikipedia.org/wiki/Cosine_similarity + +.. __: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3588146/ + +.. __: http://www.nature.com/srep/2014/140327/srep04482/full/srep04482.html + +.. __: http://www.nature.com/ng/journal/v47/n5/fig_tab/ng.3252_SF3.html + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Output** + +A HTML page displaying a heatmap representing the similarity between the new signatures and the published ones. + +Values close to 1 (red) indicate a high similarity between the signatures. + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Published signatures** + +The reference signatures matrix (COSMICv72-Hupki2014 matrix) includes + +1. The 30 signatures published in `COSMIC database, v72`__ + +2. The 4 experimental signatures obtained in mouse cells for AA, MNNG, BaP and AID that were published in `Olivier et al. 2014`__ + + +.. __: http://cancer.sanger.ac.uk/cosmic/signatures + +.. __: http://www.nature.com/srep/2014/140327/srep04482/full/srep04482.html + + + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Example** + +Matrix of known signatures + ++-------------------+---------------+-----------------------+--------------+--------------+ +| Substitution Type | Trinucleotide | Somatic Mutation Type | Signature 1 | Signature 2 | ++===================+===============+=======================+==============+==============+ +| C>A | ACA | A[C>A]A | 0.0110983262 | 0.0006827082 + ++-------------------+---------------+-----------------------+--------------+--------------+ +| C>A | ACC | A[C>A]C | 0.0091493407 | 0.0006191072 + ++-------------------+---------------+-----------------------+--------------+--------------+ +| C>A | ACG | A[C>A]G | 0.0014900705 | 0.000099279 + ++-------------------+---------------+-----------------------+--------------+--------------+ +| C>A | ACT | A[C>A]T | 0.0062338852 | 0.0003238914 + ++-------------------+---------------+-----------------------+--------------+--------------+ +| C>A | CCA | C[C>A]A | 0.0065958701 | 0.000677445 + ++-------------------+---------------+-----------------------+--------------+--------------+ +| C>A | CCC | C[C>A]C | 0.0073423678 | 0.000213681 + ++-------------------+---------------+-----------------------+--------------+--------------+ + + +</help> + +</tool> \ No newline at end of file |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecCompare_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecCompare_wrapper.sh Tue Apr 19 03:07:11 2016 -0400 |
| b |
| @@ -0,0 +1,29 @@ +#!/bin/bash + +newsign=$1 +html=$2 +ref=$3 + +output_dir=${html%%.*}_files + +matrix=${newsign%.*}_files/NMF/Files/MatrixW-Normto100.txt + +mkdir $output_dir + +Rscript --no-save $SCRIPT_PATH/R/compareSignature_Galaxy.r $ref $matrix $output_dir 2>&1 + +# Convert the image into png format +cd $output_dir + +echo "<html><body>" >> $html +echo "<center> <h2> Cosine similarity comparison </h2> </center>" >> $html + +echo "<table>" >> $html +echo "<tr> <td> <center> <br/> <a href="Similarity_Matrix.txt">Similarity_Matrix.txt</a> </center> </td> </tr>" >> $html +echo "<tr>" >> $html +echo "<td><a href="Similarity_Matrix.png">" >> $html +echo "<img width="1000" src="Similarity_Matrix.png" /></a></td>" >> $html +echo "</tr>" >> $html +echo "</table>" >> $html + +exit 0 |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecFilter.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecFilter.pl Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| b'@@ -0,0 +1,378 @@\n+# !/usr/bin/perl\n+\n+#-----------------------------------#\n+# Author: Maude #\n+# Script: mutspecFilter.pl #\n+# Last update: 18/03/16 #\n+#-----------------------------------#\n+\n+use strict;\n+use warnings;\n+use Getopt::Long;\n+use Pod::Usage;\n+use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\\.[^.]*/);\n+use File::Path;\n+\n+################################################################################################################################################################################\n+#\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tFilter an Annotaed file with Annovar\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t #\n+################################################################################################################################################################################\n+\n+our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.\n+our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome.\n+our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use.\n+our ($listAVDB) = "empty"; # Text file with the list Annovar databases.\n+our ($dir) = "";\n+\n+GetOptions(\'dir|d=s\'=>\\$dir,\'verbose|v\'=>\\$verbose, \'help|h\'=>\\$help, \'man|m\'=>\\$man, \'dbSNP=i\'=>\\$dbSNP_value, \'segDup\'=>\\$segDup, \'esp\'=>\\$esp, \'thG\'=>\\$thG, \'outfile|o=s\' => \\$output, \'refGenome=s\'=>\\$refGenome, \'pathAVDBList=s\' => \\$listAVDB) or pod2usage(2);\n+\n+our ($input) = @ARGV;\n+\n+pod2usage(-verbose=>1, -exitval=>1, -output=>\\*STDERR) if ($help);\n+pod2usage(-verbose=>2, -exitval=>1, -output=>\\*STDERR) if ($man);\n+pod2usage(-verbose=>0, -exitval=>1, -output=>\\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script\n+pod2usage(-verbose=>0, -exitval=>1, -output=>\\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)\n+\n+\n+\n+# If the dbSNP value is not equal to zero filter using the dbSNP column specify\n+our $dbSNP = 0;\n+if($dbSNP_value > 0) { $dbSNP = 1; }\n+\n+\n+############ Check flags ############\n+if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" }\n+\n+# Zero databases is specified\n+if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) )\n+{\n+\tprint STDERR "There is no databases selected for filtering against!!!\\nPlease choose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\\n";\n+\texit;\n+}\n+\n+\n+\n+############ Recover the name of the databases to filter against ############\n+my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", "");\n+my @tab_protocol = ();\n+\n+if( ($segDup == 1) || ($esp == 1) || ($thG == 1) )\n+{\n+\t### Recover the name of the column\n+\tmy $protocol = "";\n+\tExtractAVDBName($listAVDB, \\$protocol);\n+\t@tab_protocol = split(",", $protocol);\n+\n+\tfor(my $i=0; $i<=$#tab_protocol; $i++)\n+\t{\n+\t\tif($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; }\n+\t\telsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; }\n+\t\telsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; }\n+\t}\n+}\n+\n+\n+############ Filter the file ############\n+filterAgainstPublicDB();\n+\n+\n+print STDOUT "\\tFilter selected\\tdbSNP = ".$dbSNP."\\tsegDup = ".$segDup."\\tesp = ".$esp."\\tthG = ".$thG."\\n";\n+\n+\n+sub filterAgainstPublicDB\n+{\n+\topen(FILTER, ">", "$output") or die "$!: $output\\n";\n+\n+\topen(F1, $input) or die "$!: $input\\n";\n+\tmy $header = <F1>; print FILTER $header;\n+\twhile(<F1>)\n+\t{\n+\t\t$_ =~ s/[\\r\\n]+$//;\n+\t\tmy @tab = split("\\t", $_);\n+\n+\t\tmy ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0);\n+\n+\t\tif($segDup == 1)\n+\t\t{\n+\t\t\tmy $segDup_value = r'..b'\n+\t\t if($name_of_column_NB eq "toto") { next; }\n+\t\t else { return $name_of_column_NB; }\n+\t\t}\n+\t\tif($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn\'t exits in the input file $input!!!!!\\n"; exit; }\n+\t}\n+\t# Only one name is pass\n+\telse\n+\t{\n+\t\topen(FT,$input) or die "$!: $input\\n";\n+\t # For having the name of the columns\n+\t my $search_header = <FT>; $search_header =~ s/[\\r\\n]+$//; my @tab_search_header = split("\\t",$search_header);\n+\t close FT;\n+\t # The number of the column\n+\t my $name_of_column_NB = "toto";\n+\t for(my $i=0; $i<=$#tab_search_header; $i++)\n+\t {\n+\t if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; }\n+\t }\n+\t if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn\'t exits in the input file $input!!!!!\\n"; exit; }\n+\t else { return $name_of_column_NB; }\n+\t}\n+}\n+\n+=head1 NAME\n+\n+mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)\n+\n+=head1 SYNOPSIS\n+\n+\tmutspecFilter.pl [arguments] <query-file>\n+\n+ <query-file> an annotated file\n+\n+ Arguments:\n+ -h, --help print help message\n+ -m, --man print complete documentation\n+ -v, --verbose use verbose output\n+\t\t\t\t\t\t\t\t\t --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file\n+\t\t\t\t\t\t\t\t\t --segDup filter against genomic duplicate database\n+\t\t\t\t\t\t\t\t\t --esp filter against Exome Sequencing Project database (only for human)\n+\t\t\t\t\t\t\t\t\t --thG filter against 1000 genome database (onyl for human)\n+\t\t\t -o, --outfile <string> name of output file\n+\t\t\t --refGenome reference genome to use\n+\t\t\t --pathAVDBList path to the list of Annovar databases installed\n+\n+\n+Function: Filter out variants present in public databases\n+\n+ Example: # Filter against dbSNP\n+ \t\t\t\t\tmutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input\n+\n+ \t\t\t\t\t# Filter against the four databases\n+ \t\t\t\t\tmutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input\n+\n+\n+ Version: 03-2016 (March 2016)\n+\n+\n+=head1 OPTIONS\n+\n+=over 8\n+\n+=item B<--help>\n+\n+print a brief usage message and detailed explanation of options.\n+\n+=item B<--man>\n+\n+print the complete manual of the program.\n+\n+=item B<--verbose>\n+\n+use verbose output.\n+\n+=item B<--dbSNP>\n+\n+Remove all the variants presents in the dbSNP databases\n+Specify the number of column containing the annotation\n+For human and mouse genome\n+\n+=item B<--segDup>\n+\n+Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database\n+For human and mouse genome\n+\n+=item B<--esp>\n+\n+Remove all the variants with a frequency greater than 0.001 in Exome sequencing project\n+For human genome only\n+\n+=item B<--thG>\n+\n+Remove all the variants with a frequency greater than 0.001 in 1000 genome database\n+\n+=item B<--refGenome>\n+\n+the reference genome to use, could be hg19 or mm9.\n+\n+=item B<--outfile>\n+\n+the name of the output file\n+\n+=item B<--pathAVDBList>\n+\n+the path to a texte file containing the list of the Annovar databases installed.\n+\n+=back\n+\n+=head1 DESCRIPTION\n+\n+mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)\n+\n+=cut\n' |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecFilter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecFilter.xml Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| @@ -0,0 +1,97 @@ +<tool id="MutSpecfilter" name="MutSpec Filter" version="0.1" hidden="false"> +<description>Filter out variants present in public databases</description> + +<requirements> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <requirement type="package" version="5.18.1">perl</requirement> +</requirements> + +<command interpreter="perl"> + mutspecFilter.pl + --dir \$SCRIPT_PATH + $segDup + $esp + $thG + #if $FilterdbSNP.dbSNP == "true": + --dbSNP ${FilterdbSNP.column} + #else + --dbSNP 0 + #end if + --refGenome ${refGenome} + --outfile $output + $input +</command> + +<inputs> + <param name="input" type="data" format="txt" label="Input file"/> + + <param name="refGenome" type="select" label="Reference genome" help="All your data should have been annotated with the selected genome"> + <options from_data_table="annovar_index" /> + </param> + + <conditional name="FilterdbSNP"> + <param name="dbSNP" type="boolean" checked="true" truevalue="true" label="Filter against dbSNP database" help="Remove variants with a RS number" /> + <when value="true"> + <param name="column" type="data_column" data_ref="input" label="Select the dbSNP column for filtering" use_header_names="true" help="Select a column name snp or snpNonFlagged" /> + </when> + </conditional> + + + <param name="segDup" type="boolean" checked="true" truevalue="--segDup" falsevalue="" label="Filter against SegDup database" help="Remove variants present at >= 0.9 frequency in the genomic duplicate segments database" /> + <param name="esp" type="boolean" checked="true" truevalue="--esp" falsevalue="" label="Filter against the ESP database" help="Remove variants present at frequency > 0.001 in the Exome Sequencing Project database (only valid for human genomes)" /> + <param name="thG" type="boolean" checked="true" truevalue="--thG" falsevalue="" label="Filter against the 1000g database project" help="Remove variants present at frequency > 0.001 in the 1000 genome database (only valid for human genomes)" /> +</inputs> + +<outputs> + <data type="data" name="output" format="tabular" label="${input.name.split(' ')[0]} filtered" /> +</outputs> + +<help> + +**What it does** + +Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above). + +.. class:: warningmark + +The databases ESP and 1000 genome can be used only for human genomes + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Input** + +.. class:: warningmark + +Tab delimited text files generated by MutSpec-Annot tool. + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Output** + +Tab delimited text file filtered for variants considered as neutral polymorphisms. + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Example** + +Filter the following file:: + + Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups snp138 1000g2014oct_all esp6500si_all Strand context Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele2 + chr7 121717919 121717920 - G exonic AASS frameshift insertion AASS:NM_005763:exon23:c.2634dupC:p.A879fs NA rs147476318 NA NA - GCG chr7 121717919 121717920 - G + chr1 230846235 230846235 T A exonic AGT nonsynonymous SNV AGT:NM_000029:exon2:c.A362T:p.H121L NA NA NA NA - GTG chr1 230846235 230846235 T A + chr14 33290999 33290999 A G exonic AKAP6 nonsynonymous SNV AKAP6:NM_004274:exon13:c.A3980G:p.D1327G NA NA NA NA + GAC chr14 33290999 33290999 A G + chr12 8082458 8082458 C T exonic SLC2A3 nonsynonymous SNV SLC2A3:NM_006931:exon6:c.G683A:p.R228Q NA rs200481428 0.000199681 NA - CCG chr12 8082458 8082458 C T + chr4 70156391 70156391 T C exonic UGT2B28 nonsynonymous SNV UGT2B28:NM_053039:exon5:c.T1172C:p.V391A score=0.949699;Name=chr4:70035680 NA 0.000199681 NA + GTA chr4 70156391 70156391 T C + +Will produce:: + + Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups snp138 1000g2014oct_all esp6500si_all Strand context Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele2 + chr1 230846235 230846235 T A exonic AGT nonsynonymous SNV AGT:NM_000029:exon2:c.A362T:p.H121L NA NA NA NA - GTG chr1 230846235 230846235 T A + chr14 33290999 33290999 A G exonic AKAP6 nonsynonymous SNV AKAP6:NM_004274:exon13:c.A3980G:p.D1327G NA NA NA NA + GAC chr14 33290999 33290999 A G + chr4 70156391 70156391 T C exonic UGT2B28 nonsynonymous SNV UGT2B28:NM_053039:exon5:c.T1172C:p.V391A score=0.949699;Name=chr4:70035680 NA 0.000199681 NA + GTA chr4 70156391 70156391 T C + + + +</help> + +</tool> |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecNmf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecNmf.xml Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| @@ -0,0 +1,108 @@ +<?xml version="1.0"?> +<tool id="mutSpecnmf" name="MutSpec NMF" version="0.0.1"> +<description>Extract mutation signatures with the Non negative Matrix Factorization algorithm</description> + +<requirements> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <requirement type="package" version="5.18.1">perl</requirement> + <requirement type="package" version="3.1.2">R</requirement> + <requirement type="package" version="1.7.1">numpy</requirement> + <requirement type="package" version="0.1">mutspec</requirement> +</requirements> + +<command interpreter="bash"> + mutspecNmf_wrapper.sh + $html_file + "--nbSign $nbsign" + ${refGenomeSource.source} + #if $refGenomeSource.source == "html": + ${refGenomeSource.reportHTML} + #else + ${refGenomeSource.matrix} + #end if +</command> + +<inputs> + <conditional name="refGenomeSource"> + <param name="source" type="select" label="Input a MutSpec Stats report or a matrix" help="You may select either a report generated by MutSpec-Stats or a tab-delimited text matrix"> + <option value="html">Dataset generated by the tool MutSpec-Stats</option> + <option value="tab">Tab-delimited matrix</option> + </param> + <when value="html"> + <param name="reportHTML" type="data" format="html" label="Input dataset" help="Select a report generated by the MutSpec-Stats tool"/> + </when> + <when value="tab"> + <param name="matrix" type="data" format="tabular" label="Input matrix" help="Select a matrix formatted as shown further below"/> + </when> + </conditional> + <param name="nbsign" type="text" value="2" label="Number of expected signatures" help="min=2" /> +</inputs> + +<outputs> + <data name="html_file" format="html" label="NMF result on ${on_string} ($nbsign signatures)" /> +</outputs> + +<help> + +**What it does** + +Extract mutation signatures composed of 96 SBS types (6 SBS types in their trinucleotide sequence context) using the non-negative matrix (`NMF`__) factorisation algorithm of Brunet with the Kullback-Leibler divergence penalty implemented in a `R package`__. + +.. __: http://www.nature.com/nature/journal/v401/n6755/full/401788a0.html +.. __: http://www.biomedcentral.com/1471-2105/11/367 + + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Input formats** + +The tool accepts a HTML report produces by the tool MutSpec-Stats or a matrix of mutation count in a tab-delimited text file format (see example below). + +.. class:: warningmark + +If the input is a matrix of mutation count, the sum of mutation counts for each row should be not null. + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Output** + +Matrices and graphs representing the composition of the mutation signatures found by NMF (Matrix W) and the contributions of each sample to the signatures (Matrix H). The tool also produces a matrice that can be used with the tool MutSpec-compare for comparing the identified signatures with known signatures. + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Example: matrix of mutation count (96 rows + a header with the samples names)** + ++--------+----------+----------+----------+ +| | Sample_1 | Sample_2 | Sample_3 | ++========+==========+==========+==========+ +|A[C>A]A | 4 | 3 | 1 | ++--------+----------+----------+----------+ +|A[C>T]A | 2 | 1 | 0 | ++--------+----------+----------+----------+ +|A[C>G]A | 13 | 2 | 1 | ++--------+----------+----------+----------+ +|A[T>A]A | 10 | 3 | 6 | ++--------+----------+----------+----------+ +|A[T>C]A | 9 | 6 | 1 | ++--------+----------+----------+----------+ +|A[T>G]A | 2 | 1 | 0 | ++--------+----------+----------+----------+ +| ... | ++--------+----------+----------+----------+ +|T[C>A]T | 5 | 2 | 2 | ++--------+----------+----------+----------+ +|T[C>G]T | 5 | 2 | 0 | ++--------+----------+----------+----------+ +|T[C>T]T | 11 | 4 | 2 | ++--------+----------+----------+----------+ +|T[T>A]T | 3 | 0 | 5 | ++--------+----------+----------+----------+ +|T[T>C]T | 39 | 17 | 1 | ++--------+----------+----------+----------+ +|T[T>G]T | 12 | 8 | 1 | ++--------+----------+----------+----------+ + + +</help> + +</tool> |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecNmf_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecNmf_wrapper.sh Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| @@ -0,0 +1,94 @@ +#!/bin/bash + +######################################### +### SPECIFY THE NUMBER OF CPU ### +######################################### +cpu=1 + + + + +html=$1;shift +parameters=$1;shift +source=$1;shift +input=$1 + +if [[ $source == "html" ]] +then input=${input%%.*}_files/Mutational_Analysis/Figures/Input_NMF/Input_NMF_Count.txt +fi + +output_dir=${html%%.*}_files +mkdir $output_dir + +Rscript $SCRIPT_PATH/R/somaticSignature_Galaxy.r $parameters --cpu $cpu --input $input --output $output_dir 2>&1 + + +## Test the existence of the files and graphs produced by NMF +if [[ ! -e "$output_dir/NMF/Files/MatrixW-Normto100.txt" ]]; then + >&2 echo "error" + exit +fi + + +echo "<html><body>" >> $html +echo "<center> <h2> NMF Mutational signatures analysis </h2> </center>" >> $html + + +echo "<table>" >> $html +echo "<tr> <br/> <th><h3>Heatmap of the mixture coefficient matrix</h3></th> </tr>" >> $html +echo "<tr> <td> <center> <br/> <a href="NMF/Files/Cluster_MixtureCoeff.txt">Cluster_MixtureCoeff.txt</a> </center> </td> </tr>" >> $html +echo "<tr>" >> $html + +if [[ ! -e "$output_dir/NMF/Figures/Heatmap_MixtureCoeff.png" ]]; then + echo "WARNING: NMF package can't plot the heatmap when the samples size is above 300. <br/>" >> $html +else + echo "<td> <center> <a href="NMF/Figures/Heatmap_MixtureCoeff.png">" >> $html + echo "<img src="NMF/Figures/Heatmap_MixtureCoeff.png" /></a> <center> </td>" >> $html +fi +echo "</tr>" >> $html +echo "</table>" >> $html + +echo "<br/><br/>" >> $html + +echo "<table>" >> $html +echo "<tr>" >> $html +echo "<th><h3>Signature composition</h3></th>" >> $html +echo "</tr>" >> $html +echo "<tr><td> <center> <a href="NMF/Files/MatrixW-Normto100.txt">Composition somatic mutation (input matrix for the tool MutSpec-Compare)</a><center></td></tr>" >> $html +echo "<tr>" >> $html +echo "<td><a href="NMF/Figures/CompositionSomaticMutation.png">" >> $html +echo "<img width="1000" src="NMF/Figures/CompositionSomaticMutation.png" /></a></td>" >> $html +echo "</tr> " >> $html +echo "</table>" >> $html +echo "<br/><br/>" >> $html + +echo "<table>" >> $html +echo "<tr>" >> $html +echo "<th><h3>Sample contribution to signatures</h3></th>" >> $html +echo "</tr>" >> $html +echo "<tr><td> <center> <a href="NMF/Files/MatrixH-Inputggplot2.txt">Contribution mutation signature matrix</a></center></td></tr>" >> $html +echo "<tr>" >> $html +echo "<td><a href="NMF/Figures/ContributionMutationSignature.png">" >> $html +echo "<img width="700" src="NMF/Figures/ContributionMutationSignature.png" /></a></td>" >> $html +echo "</tr> " >> $html +echo "</table>" >> $html +echo "<br/><br/>" >> $html + + +echo "<table>" >> $html +echo "<tr>" >> $html +echo "<th><h3>Average contributions of each signatures in each cluster</h3></th>" >> $html +echo "</tr>" >> $html +echo "<tr><td> <center> <a href="NMF/Files/Average_ContriByCluster.txt">Average contributions</a></center></td></tr>" >> $html +echo "<tr>" >> $html +echo "<td><a href="NMF/Figures/Average_ContriByCluster.png">" >> $html +echo "<img width="700" src="NMF/Figures/Average_ContriByCluster.png" /></a></td>" >> $html +echo "</tr> " >> $html +echo "</table>" >> $html +echo "<br/><br/>" >> $html + +echo "<br/><br/><br/><br/>" >> $html + + + +exit 0 |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecSplit.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecSplit.pl Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| @@ -0,0 +1,64 @@ +# !/usr/bin/perl + +#-----------------------------------# +# Author: Vincent # +# Script: mutspecSplit.pl # +# Last update: 01/07/14 # +#-----------------------------------# + + +use strict; +use warnings; +use Getopt::Long; + +our $file=""; +our $column=""; +our $path=""; +our $key=""; + + +GetOptions('file|f=s' =>\$file, + 'key|k=s' =>\$key, + 'column|i=s' =>\$column, + 'path|p=s' =>\$path); + + +mkdir ("outputFiles") or die ("Erreur creation repertoire\n"); +# print $file,"\n", $key,"\n", $column,"\n", $path,"\n"; exit; + +my %tab; +if ($column==0) {$column++;} +$column--; + +open(FILE, "$file") or die "cannot open $file\n"; + +$_=<FILE>; #skip headers +chomp; +my @line = split(/\t/,$_); +my $headers = join("\t", @line[0..($column-1),($column+1)..$#line]); + +while(<FILE>){ + chomp; + my @line = split(/\t/,$_); + #if (!exists($tab{$line[$column]})) { $tab{$line[$column]}=[]; } + #push( @{ $tab{$line[$column]} }, join("\t", @line[0..($column-1),($column+1)..$#line]) ); + my $tmp = join("\t", @line[0..($column-1),($column+1)..$#line]) ; + my $id = $line[$column]; + push( @{ $tab{$id} }, $tmp); +} + + +while( my ($name,$lines) = each(%tab) ) { + my $output="outputFiles/$name"; + #my $output="primary_$key" . "_$name" . "_visible_tabular"; + # my $output=$name; + open(FILE, ">$output") or die "cannot create file $output \n"; + print FILE $headers."\n"; + foreach my $line (@{$lines}){ + print FILE "$line\n"; + } + close FILE; +} + +my $list=`ls outputFiles/*`; +print ($list); |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecSplit.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecSplit.xml Tue Apr 19 03:07:11 2016 -0400 |
| b |
| @@ -0,0 +1,85 @@ +<tool id="mutSpecsplit" name="MutSpec Split" version="0.1" hidden="false" force_history_refresh="True"> +<description>Split a tabular file by sample ID</description> + +<requirements> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <requirement type="package" version="5.18.1">perl</requirement> +</requirements> + +<command interpreter="perl"> + mutspecSplit.pl -f $input -c $column +</command> + +<inputs> + <param name="input" type="data" format="tabular" label="Input file" help="If using the batch mode (multiple datasets), all files must contain the same sample id column. The tool doesn't support dataset list as input !" /> + <param name="column" type="data_column" data_ref="input" label="Split by" use_header_names="true"/> +</inputs> + +<outputs> + <collection name="splitted_output" type="list" label="collection"> + <discover_datasets pattern="__name__" ext="tabular" directory="outputFiles"/> + </collection> +</outputs> + +<help> + +**What it does** + +This tool splits a file into several files based on the content of the selected column. +It can be used for example to split a file that contains data on 10 samples into 10 files using the same sample ID column. +The resulting files are saved into a dataset list/collection. + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Input** + +One or multiple tab delimited text files. + +If multiple files are selected, they should all have the same column on which you want to do the split. + +.. class:: warningmark + +The tool doesn't support dataset list as input !!! + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Output** + +A dataset list containing tab delimited text files resulting from splitting the input file(s). + +.. class:: warningmark + +If a large number of file are generated, you'll need to refresh the history to see all files included in the dataset list. The entire list of file may still not be correctly displayed due to a known bug in Galaxy that may be fixed in future versions. + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Example** + +Split by sample ID the following file:: + + Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups 1000g2012apr_all snp137 esp6500si_all cosmic67 Strand Context Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic Sample_name Pubmed_PMID Age Comments + chr12 82752552 82752552 G A exonic METTL25 nonsynonymous SNV NM_032230:c.G208A:p.E70K NA NA NA NA NA + GTCGGAGACGGAGGCCCTGCC chr12 82752552 G A APA29 23913001 2 NA + chr11 86663436 86663436 C A exonic FZD4 nonsynonymous SNV NM_012193:c.G362T:p.C121F NA NA NA NA NA - GACTGAAAGACACATGCCGCC chr11 86663436 C A APA12 21311022 34 Tissue Remark Fixed:Remark + chr12 57872994 57872994 G A exonic ARHGAP9 nonsynonymous SNV NM_001080157:c.C196T:p.R66C NA NA NA 0.000077 ID=COSM431582;OCCURENCE=2(breast) - GCTTCTAGGCGTCTTGCCAAC chr12 57872994 G A APA12 21311022 34 Tissue Remark Fixed:Remark + + +Will create a dataset list with two dataset: + + +APA29:: + + Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups 1000g2012apr_all snp137 esp6500si_all cosmic67 Strand Context Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic Sample_name Pubmed_PMID Age Comments + chr12 82752552 82752552 G A exonic METTL25 nonsynonymous SNV NM_032230:c.G208A:p.E70K NA NA NA NA NA + GTCGGAGACGGAGGCCCTGCC chr12 82752552 G A APA29 23913001 2 NA + + +APA12:: + + Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups 1000g2012apr_all snp137 esp6500si_all cosmic67 Strand Context Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic Sample_name Pubmed_PMID Age Comments + chr11 86663436 86663436 C A exonic FZD4 nonsynonymous SNV NM_012193:c.G362T:p.C121F NA NA NA NA NA - GACTGAAAGACACATGCCGCC chr11 86663436 C A APA12 21311022 34 Tissue Remark Fixed:Remark + chr12 57872994 57872994 G A exonic ARHGAP9 nonsynonymous SNV NM_001080157:c.C196T:p.R66C NA NA NA 0.000077 ID=COSM431582;OCCURENCE=2(breast) - GCTTCTAGGCGTCTTGCCAAC chr12 57872994 G A APA12 21311022 34 Tissue Remark Fixed:Remark + + + +</help> + +</tool> |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecStat.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecStat.pl Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| b'@@ -0,0 +1,3352 @@\n+#!/usr/bin/env perl\n+\n+#-----------------------------------#\n+# Author: Maude #\n+# Script: mutspecStat.pl #\n+# Last update: 09/04/16 #\n+#-----------------------------------#\n+\n+use strict;\n+use warnings;\n+use Getopt::Long;\n+use Pod::Usage;\n+use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\\.[^.]*/);\n+use File::Path;\n+use Statistics::R;\n+use Spreadsheet::WriteExcel;\n+\n+our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.\n+our ($refGenome, $output, $folder_temp, $path_R_Scripts, $path_SeqrefGenome) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path for saving the temporary files; The path to R scripts; The path to the fasta reference sequences\n+our ($poolData, $oneReportPerSample) = (2, 2); # If a folder is pass as input file pool all the data and generate the report on the pool and for each samples; # Generate one report for each samples\n+\n+\n+GetOptions(\'verbose|v\'=>\\$verbose, \'help|h\'=>\\$help, \'man|m\'=>\\$man, \'refGenome=s\'=>\\$refGenome, \'outfile|o=s\' => \\$output, \'pathTemporary|temp=s\' => \\$folder_temp, \'pathRscript=s\' => \\$path_R_Scripts, \'pathSeqRefGenome=s\' => \\$path_SeqrefGenome, \'poolData\' => \\$poolData, \'reportSample\' => \\$oneReportPerSample) or pod2usage(2);\n+\n+our ($input) = @ARGV;\n+\n+pod2usage(-verbose=>1, -exitval=>1, -output=>\\*STDERR) if ($help);\n+pod2usage(-verbose=>2, -exitval=>1, -output=>\\*STDERR) if ($man);\n+pod2usage(-verbose=>0, -exitval=>1, -output=>\\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script\n+pod2usage(-verbose=>0, -exitval=>1, -output=>\\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)\n+\n+\n+\n+######################################################################################################################################################\n+#\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tGLOBAL VARIABLES\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t #\n+######################################################################################################################################################\n+# Recover the current path\n+our $pwd = `pwd`;\n+chomp($pwd);\n+\n+# Path to R scripts\n+our $pathRScriptTxnSB = "$path_R_Scripts/R/transciptionalStrandBias.r";\n+our $pathRScriptMutSpectrum = "$path_R_Scripts/R/mutationSpectra_Galaxy.r";\n+\n+our $folderMutAnalysis = "";\n+our @pathInput = split("/", $input);\n+\n+# Hash table with the length of each chromosomes\n+our %chromosomes;\n+\n+######################################################################################################################################################\n+#\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tMAIN \t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t #\n+######################################################################################################################################################\n+# Check the presence of the flags and create the output and temp directories\n+CheckFlags();\n+\n+# Retrieve chromosomes length\n+checkChrDir();\n+\n+\n+print "-----------------------------------------------------------------\\n";\n+print "-----------------Report Mutational Analysis----------------------\\n";\n+print"-----------------------------------------------------------------\\n";\n+\n+# First check if the file is annotated or not\n+CheckAnnotationFile($input);\n+\n+# Calculate the statistics and generate the report\n+my @colInfoAV = qw(Chr Start Ref Alt);\n+ReportMutDist($input, $folderMutAnalysis, $folder_temp, \\@colInfoAV, $refGenome);\n+\n+# Remove the temporary directory\n+rmtree($folder_temp);\n+\n+\n+######################################################################################################################################################\n+#\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tFUNCTIONS\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t #\n+###########'..b' $wb->set_custom_color(45, 255, 192, 203);# T:A>G:C in pink\n+\t}\n+\tsub BackgroundColor\n+\t{\n+\t\tmy ($wb, $bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink) = @_;\n+\n+\t\t$$bgColor_blue = $wb->set_custom_color(48, 0, 0, 204);\n+\t\t$$bgColor_black = $wb->set_custom_color(49, 0, 0, 0);\n+\t\t$$bgColor_red = $wb->set_custom_color(50, 255, 0, 0);\n+\t\t$$bgColor_gray = $wb->set_custom_color(51, 205, 205, 205);\n+\t\t$$bgColor_green = $wb->set_custom_color(52, 0, 204, 51);\n+\t\t$$bgColor_pink = $wb->set_custom_color(53, 255, 192, 203);\n+\t}\n+}\n+\n+\n+sub recoverNumCol\n+{\n+\tmy ($input, $name_of_column) = @_;\n+\n+ open(F1,$input) or die "recoverNumCol: $!: $input\\n";\n+ # For having the name of the columns\n+ my $search_header = <F1>; $search_header =~ s/[\\r\\n]+$//; my @tab_search_header = split("\\t",$search_header);\n+ close F1;\n+ # The number of the column\n+ my $name_of_column_NB = "toto";\n+ for(my $i=0; $i<=$#tab_search_header; $i++)\n+ {\n+ if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; }\n+ }\n+ if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn\'t exits in the input file $input!!!!!\\n"; exit; }\n+ else { return $name_of_column_NB; }\n+}\n+\n+\n+\n+\n+=head1 NAME\n+\n+mutSpec-Stat\n+\n+=head1 SYNOPSIS\n+\n+\tmutSpecstat.pl [arguments] <query-file>\n+\n+ <query-file> can be a folder with multiple VCF or a single VCF\n+\n+ Arguments:\n+ -h, --help print help message\n+ -m, --man print complete documentation\n+ -v, --verbose use verbose output\n+ --refGenome the reference genome to use (human, mouse or rat genomes)\n+ -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file\n+ -temp --pathTemporary <string> the path for saving the temporary files\n+ --pathSeqRefGenome the path to the fasta reference sequences\n+ --poolData generate the pool of all the samples (optional)\n+ --reportSample generate a report for each sample (optional)\n+\n+\n+Function: automatically run a pipeline and calculate various statistics on mutations\n+\n+ Example: mutSpecstat.pl --refGenome hg19 --outfile output_directory --temp path_to_temporary_directory --pathRscript path_to_R_scripts --pathSeqRefGenome path_fasta_ref_seq --poolData --reportSample input\n+\n+ Version: 04-2016 (April 2016)\n+\n+\n+=head1 OPTIONS\n+\n+=over 8\n+\n+=item B<--help>\n+\n+print a brief usage message and detailed explanation of options.\n+\n+=item B<--man>\n+\n+print the complete manual of the program.\n+\n+=item B<--verbose>\n+\n+use verbose output.\n+\n+=item B<--refGenome>\n+\n+the reference genome to use, could be human, mouse or rat genomes.\n+\n+=item B<--outfile>\n+\n+the directory of output file names. If it is nor specify the same directory as the input file is used.\n+\n+=item B<--pathTemporary>\n+\n+the path for saving temporary files generated by the script.\n+If any is specify a temporary folder is created in the same directory where the script is running.\n+Deleted when the script is finish\n+\n+=item B<--pathSeqRefGenome>\n+\n+The path to the fasta reference sequences\n+\n+=item B<--poolData only for the report>\n+\n+calculate the statistics on the pool of all the data pass in input\n+\n+=item B<--reportSample only for the report>\n+\n+generate a report for each samples\n+\n+=head1 DESCRIPTION\n+\n+mutSpecstat is a perl script for calculated various statistics on mutations\n+An Excel report containing the mutation type distribution per functional region, the strand bias and the sequence context on genomic and coding sequence is created.\n+The different statistics are illustrated using ggplot2.\n+\n+=cut\n' |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecStat.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecStat.xml Tue Apr 19 03:07:11 2016 -0400 |
| b |
| @@ -0,0 +1,157 @@ +<tool id="mutSpecStat" name="MutSpec Stat" version="0.1" hidden="false"> +<description>Calculate various statistics on mutations</description> + +<requirements> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <requirement type="package" version="5.18.1">perl</requirement> + <requirement type="package" version="3.3">weblogo</requirement> + <requirement type="package" version="1.7.1">numpy</requirement> + <requirement type="package" version="3.1.2">R</requirement> + <requirement type="package" version="0.1">mutspec</requirement> +</requirements> + +<command interpreter="bash"> + mutspecStat_wrapper.sh + $html + ${GALAXY_DATA_INDEX_DIR}/shared/ucsc/chrom/ + #if $estimateSignature.estimSign == "true": + ${estimateSignature.estimT} + #else + 0 + #end if + + "--refGenome ${refGenome} --pathSeqRefGenome ${refGenome.fields.path} $pooldata $reportSample" + #import re + #for $f in $dataset_list + #set $regexp = $re.compile("\((.*)\)") + #if $regexp.search($f.name) + #set filename=$regexp.search($f.name) + "$f=${filename.group(1)}" + #else + "$f=${f.name}" + #end if + #end for +</command> + +<inputs> + <param name="dataset_list" type="data_collection" format="tabular" collection_type="list" label="Annotated Dataset List" help="Select a dataset list/collection from your history" /> + <param name="refGenome" type="select" label="Reference genome" help="All data in your dataset list should have been generated with the selected genome"> + <options from_data_table="annovar_index" /> + </param> + + <param name="pooldata" type="boolean" checked="true" truevalue="--pooldata" falsevalue="" label="Include statistics on the pooled samples" /> + <param name="reportSample" type="boolean" checked="false" truevalue="--reportSample" falsevalue="" label="Generate one output file for each sample" help="By default, one output Excel file will be generated with statistics of each sample shown in different data sheets. Setting this option to true will generate one Excel file for each sample instead. It is recommended to use this option if your dataset list contains more than 250 files as the Excel output file may be too heavy to open easily on a computer with limited RAM"/> + + <conditional name="estimateSignature"> + <param name="estimSign" type="boolean" label="Compute statistics for estimating the number of signatures" help="This option gererates different statistics that can be used to estimate the number of signatures to extract with NMF (this number should be used in the MutSpec-NMF tool"/> + <when value="true"> + <param name="estimT" type="text" value="8" label="Maximum number of signatures to compute" help="Warning: Selecting a number above 8 may not work on small datasets"/> + </when> + </conditional> + +</inputs> + +<outputs> + <data name="html" type="data" format="html" label="mutation spectra report on ${dataset_list.name}" /> +</outputs> + +<stdio> + <regex match="FutureWarning" + source="both" + level="warning" + description="FutureWarning" /> +</stdio> + +<help> + +**What it does** + +MutSpec-Stat calculates various statistics describing mutation characteristics extracted from a dataset collection, and estimate (optional) the number of signatures present in the dataset. +The statistics include overall distribution of mutations, mutation distribution for single base substitutions (SBS) by functional regions, chromosomes, or in their trinucleotide sequence context (see details below). + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Input formats** + +The tool accepts a dataset list + +.. class:: infomark + +You should thus create a dataset list even when using one file (see Galaxy help to learn `how to create a dataset list`__) + +.. __: https://wiki.galaxyproject.org/Histories#Dataset_Collections + +.. class:: warningmark + +The input files must have been generated by the MutSpec-Annot tool (so they contain the required annotations). + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Output** + +MutSpec-Stat generates an html page with links to : + - an Excel file that includes all computed statistics shown in tabular and graphical formats, for each sample (one by datasheet) and for the pooled samples (optional), + - html pages for individual sample results, + - the input matrix for the tool MutSpec-NMF, + - the result of the estimation of the number of signatures (if the option "Compute statistics for estimating the number of signatures" was selected). + +The following statistics are generated: + +**Graph 1. SBS distribution** +Proportion (percent of all SBS) of each type of single base substitution (SBS). +All SBS are considered, including the ones without strand orientation annotation. + +**Table 1. Frequency and counts of all SBS** +Values corresponding to graph 1. + + +**Graph 2. Impact on protein sequence** +Impact of all mutations (SBS and Indel) on the protein sequence based on the ExonicFunc.refGene annotation. +For more details about the annotation, please visit the `Annovar web page`__ + +.. __: http://www.openbioinformatics.org/annovar/annovar_gene.html#output1 + + +**Table 2. Frequency and counts of functional impacts** +Values corresponding to graph 2. + + +**Graph 3. Stranded distribution of SBS** +Proportion (percent of all SBS with strand annotation) of the six substitution types on the transcribed and non-transcribed strand. +Only regions with strand annotation are considered. + +**Table 3. Significance of the strand biases** +The strand bias for each SBS type is calculated as the ratio of SBS on the non-transcribed (coding) versus the transcribed (non-coding) strand. +The statistical significance of the differences between the mutational frequencies on the non-transcribed and the +transcribed strand (equal to 0.5, as expected by chance) is assessed using a chi-squared test followed by the Benjamini- +Hochberg procedure for multiple testing corrections (only samples with at least 1 mutations on the non-transcribed or on the transcribed strand are considered). +Two tables are shown to display the 6 SBS types in both orientations. + + +**Table 4. SBS distribution by functional region** +Count and percentages of SBS in genomic regions based on the Func.refGene annotation. + + +**Table 5. Strand bias by functional region** +Counts of the strand bias for the 6 SBS types in different functional regions. + + +**Table 6. SBS distribution per chromosome** +Counts of SBS per chromosome for the six SBS types. +The correlation between SBS counts and chromosome size is calculated using a Pearson correlation test. + + +**Panel 1. Trinucleotide sequence context of SBS on the genomic sequence** +The trinucleotide sequence context takes into consideration the flanking base in 5' and in 3' of the SBS. +SBS counts and frequency data are shown as tables, heatmaps or bar graphs. The heatmap colors are scaled to the maximum value of the corresponding table. The bar graph is scaled to the maximum frequency value (total number of mutation by SBS type is shown in parenthesis). + + + +**Panel 2. Stranded analysis of trinucleotide sequence context of SBS** +SBS within their trinucleotide sequence context are counted on the non-transcribed and transcribed strands of the gene region they are located in. Counts and frequencies are shown as tables or bar graphs. +Only SBS with strand orientation annotation are considered in this analysis (strand annotation retrieved from RefSeq database). + + +</help> + +</tool> |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b mutspecStat_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecStat_wrapper.sh Tue Apr 19 03:07:11 2016 -0400 |
| [ |
| b'@@ -0,0 +1,507 @@\n+#!/bin/bash\n+\n+#########################################\n+### SPECIFY THE NUMBER OF CPU ###\n+#########################################\n+cpu=1\n+\n+\n+\n+\n+\n+#########################################\n+### Recover the arguments ###\n+#########################################\n+html=$1;shift\n+len_file_path=$1;shift\n+estimSign=$1;shift\n+parameters=$1;shift\n+working_dir=`pwd`\n+\n+\n+\n+mkdir in\n+cd in\n+\n+names=$(sed \'s/\\s/_/g\' <<< $*)\n+names=$(sed \'s/_\\// \\//g\' <<< $names)\n+names=$(sed \'s/_annotated//g\' <<< $names)\n+names=$(sed \'s/_filtered//g\' <<< $names)\n+names=$(sed \'s/\\.txt_/_/\' <<< $names)\n+\n+for name in ${names}\n+do\n+ file=$(sed \'s/=/ /\' <<< $name);\n+ echo $file\n+ ln -s $file\n+done\n+cd ..\n+\n+output_dir=${html%%.*}_files\n+\n+\n+#########################################\n+### Calculates the statistics ###\n+#########################################\n+\n+perl $SCRIPT_PATH/mutspecStat.pl --outfile $output_dir \\\n+\t--temp "$working_dir/temp" \\\n+\t--pathRscript $SCRIPT_PATH \\\n+\t$parameters \\\n+\t$working_dir/in\n+\n+\n+#########################################\n+### Estimate the number of signatures ###\n+#########################################\n+if [[ $estimSign > 0 ]]; then\n+ Rscript $SCRIPT_PATH/R/estimateSign_Galaxy.r --input $output_dir/Mutational_Analysis/Figures/Input_NMF/Input_NMF_Count.txt --stop $estimSign --cpu $cpu --output $output_dir/Mutational_Analysis/Figures/Estimate_Number_Signatures.png 2>&1\n+fi\n+\n+\n+#########################################\n+### Create css #\n+#########################################\n+css=$output_dir/Mutational_Analysis/style.css\n+echo ".legend{position:relative}.legend .legend-hidden{display:none;position:absolute;background-color:#fff;border:3px solid #03F;padding:3px;color:#000;font-size:1em;border-radius:10px;margin-top:-40px}.legend:hover .legend-hidden{display:block}" > $css\n+\n+\n+\n+# HMTL page for the result of the tool\n+echo "<html>" >> $html\n+echo "<body>" >> $html\n+\n+if [ -d $output_dir/Mutational_Analysis/Figures ]; then\n+\n+echo "<center> <h2>Mutational spectra report summary</h2> </center>" >> $html\n+\n+echo "<br/> Download the full report in Excel" >> $html\n+\n+## One report with all the samples. Specify the full path\n+if [[ -e "$output_dir/Mutational_Analysis/Report_Mutation_Spectra.xls" ]]\n+then\n+\t# Interpreted by Galaxy so don\'t need the full path\n+\techo "<br/><a href="Mutational_Analysis/Report_Mutation_Spectra.xls">Report_Mutation_Spectra.xls</a>" >> $html\n+fi\n+## One report for each samples\n+for file in $names\n+do\n+ name=$(echo ${file}| cut -d"=" -f2)\n+ name=${name%.*}\n+\n+ # One report for each samples\n+ if [[ -e "$output_dir/Mutational_Analysis/Report_Mutation_Spectra-$name.xls" ]]\n+ then\n+ echo "<br/><a href="Mutational_Analysis/Report_Mutation_Spectra-$name.xls">Report_Mutation_Spectra-$name.xls</a>" >> $html\n+ fi\n+done\n+## One report for each samples: Pool_Data\n+if [[ $parameters =~ "--pooldata" ]]; then\n+ if [[ -e "$output_dir/Mutational_Analysis/Report_Mutation_Spectra-Pool_Data.xls" ]]; then\n+ echo "<br/><a href="Mutational_Analysis/Report_Mutation_Spectra-Pool_Data.xls">Report_Mutation_Spectra-Pool_Data.xls</a>" >> $html\n+ fi\n+fi\n+\n+\n+## Input file for NMF\n+if [[ -e "$output_dir/Mutational_Analysis/Figures/Input_NMF/Input_NMF_Count.txt" ]]\n+then\n+ # Interpreted by Galaxy so don\'t need the full path\n+ echo "<br/><br/> Download the input file for the tool mutSpec-NMF" >> $html\n+ echo "<br/><a href="Mutational_Analysis/Figures/Input_NMF/Input_NMF_Count.txt">Input_NMF_Count.txt</a><br/>" >> $html\n+fi\n+\n+## Computed statistics for estimating the number of signatures\n+if [[ $estimSign > 0 ]]; then\n+ echo "<br/> Link to the computed statistics for estimating the number of signatures <br/>" >> $html\n+ if [[ -e "$output_dir/Mutational_Analysis/Figures/Estimate_Number_Signatures.png" ]]; then\n+ outEstimateSign="$output_dir/Mutational_Analysis/EstimatingSignatures.html"\n+ touch $outEstimateSign\n+ echo '..b'ignaturePercent.txt</a> </center> </td>" >> $outfilePoolData\n+ echo "</tr><tr>" >> $outfilePoolData\n+\n+ echo "<td>" >> $outfilePoolData\n+ echo "<span class="legend"><img src="Figures/Stranded_Analysis/Pool_Data/Pool_Data-StrandedSignaturePercent.png width="1300""/>" >> $outfilePoolData\n+ echo "<span class="legend-hidden">" >> $outfilePoolData\n+ echo "<center><B>Panel 2. Stranded analysis of trinucleotide sequence context of SBS</center></B><br/>Proportion of SBS with their trinucleotide context considering the non-transcribed and transcribed strand<br/>" >> $outfilePoolData\n+ echo "</td>" >> $outfilePoolData\n+ echo "</tr>" >> $outfilePoolData\n+ echo "</table>" >> $outfilePoolData\n+\n+ echo "<br/><br/>" >> $outfilePoolData\n+\n+ #####################################################\n+ # Sequence logo generated with Weblogo3: Pool #\n+ #####################################################\n+ echo "<table>" >> $outfilePoolData\n+ echo "<h3>Sequence logo generated with Weblogo3</h3>" >> $outfilePoolData\n+ # C>A\n+ echo "<tr>" >> $outfilePoolData\n+ if [[ ! -e "$output_dir/Mutational_Analysis/Figures/WebLogo/Pool_Data/Pool_Data-CA-Probability.png" ]]; then\n+ echo "<td>WARNING: No sequence for C>A </br> </td>" >> $outfilePoolData\n+ else\n+ echo "<td><a href="Figures/WebLogo/Pool_Data/Pool_Data-CA.fa">Pool_Data-CA.fa</a><br/>" >> $outfilePoolData\n+ echo "<img src="Figures/WebLogo/Pool_Data/Pool_Data-CA-Probability.png"/><br/></td>" >> $outfilePoolData\n+ fi\n+ # C>G\n+ if [[ ! -e "$output_dir/Mutational_Analysis/Figures/WebLogo/Pool_Data/Pool_Data-CG-Probability.png" ]]; then\n+ echo "<td>WARNING: No sequence for C>G </br> </td>" >> $outfilePoolData\n+ else\n+ echo "<td><a href="Figures/WebLogo/Pool_Data/Pool_Data-CG.fa">Pool_Data-CG.fa</a><br/>" >> $outfilePoolData\n+ echo "<img src="Figures/WebLogo/Pool_Data/Pool_Data-CG-Probability.png"/><br/></td>" >> $outfilePoolData\n+ fi\n+ # C>T\n+ if [[ ! -e "$output_dir/Mutational_Analysis/Figures/WebLogo/Pool_Data/Pool_Data-CT-Probability.png" ]]; then\n+ echo "<td>WARNING: No sequence for C>T </br> </td>" >> $outfilePoolData\n+ else\n+ echo "<td><a href="Figures/WebLogo/Pool_Data/Pool_Data-CT.fa">Pool_Data-CT.fa</a><br/>" >> $outfilePoolData\n+ echo "<img src="Figures/WebLogo/Pool_Data/Pool_Data-CT-Probability.png"/><br/></td>" >> $outfilePoolData\n+ fi\n+ echo "</tr>" >> $outfilePoolData\n+\n+ # T>A\n+ echo "<tr>" >> $outfilePoolData\n+ if [[ ! -e "$output_dir/Mutational_Analysis/Figures/WebLogo/Pool_Data/Pool_Data-TA-Probability.png" ]]; then\n+ echo "<td>WARNING: No sequence for T>A </br> </td>" >> $outfilePoolData\n+ else\n+ echo "<td><a href="Figures/WebLogo/Pool_Data/Pool_Data-TA.fa">Pool_Data-TA.fa</a><br/>" >> $outfilePoolData\n+ echo "<img src="Figures/WebLogo/Pool_Data/Pool_Data-TA-Probability.png"/><br/></td>" >> $outfilePoolData\n+ fi\n+ # T>C\n+ if [[ ! -e "$output_dir/Mutational_Analysis/Figures/WebLogo/Pool_Data/Pool_Data-TC-Probability.png" ]]; then\n+ echo "<td>WARNING: No sequence for T>C </br> </td>" >> $outfilePoolData\n+ else\n+ echo "<td><a href="Figures/WebLogo/Pool_Data/Pool_Data-TC.fa">Pool_Data-TC.fa</a><br/>" >> $outfilePoolData\n+ echo "<img src="Figures/WebLogo/Pool_Data/Pool_Data-TC-Probability.png"/><br/></td>" >> $outfilePoolData\n+ fi\n+ # T>G\n+ if [[ ! -e "$output_dir/Mutational_Analysis/Figures/WebLogo/Pool_Data/Pool_Data-TG-Probability.png" ]]; then\n+ echo "<td>WARNING: No sequence for T>G </br> </td>" >> $outfilePoolData\n+ else\n+ echo "<td><a href="Figures/WebLogo/Pool_Data/Pool_Data-TG.fa">Pool_Data-TG.fa</a><br/>" >> $outfilePoolData\n+ echo "<img src="Figures/WebLogo/Pool_Data/Pool_Data-TG-Probability.png"/><br/></td>" >> $outfilePoolData\n+ fi\n+ echo "</tr>" >> $outfilePoolData\n+ echo "</table>" >> $outfilePoolData\n+\n+ echo "</body></html>" >> $outfilePoolData\n+\n+fi # End if --poolData\n+\n+fi # End if [ -d $output_dir/Mutational_Analysis/Figures ]\n+\n+echo "</body></html>" >> $html\n+\n+exit 0\n+\n' |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b tool-data/annovar_index.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/annovar_index.loc.sample Tue Apr 19 03:07:11 2016 -0400 |
| b |
| @@ -0,0 +1,7 @@ +# +# Database name (value), dbkey, type, and path. +# +# +#hg19 hg19 filter /home/galaxy/annovar/hg19db/ + + |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Apr 19 03:07:11 2016 -0400 |
| b |
| @@ -0,0 +1,7 @@ +<!-- ANNOVAR files --> +<tables> +<table name="annovar_index" comment_char="#"> +<columns>value, dbkey, type, path</columns> +<file path="tool-data/annovar_index.loc" /> +</table> +</tables> |
| b |
| diff -r 000000000000 -r 8c682b3a7c5b tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Apr 19 03:07:11 2016 -0400 |
| b |
| @@ -0,0 +1,54 @@ +<?xml version="1.0"?> +<tool_dependency> + <set_environment version="1.0"> + <environment_variable action="set_to" name="SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable> + </set_environment> + + <package name="perl" version="5.18.1"> + <install version="1.0"> + <actions> + <action type="setup_perl_environment"> + <repository changeset_revision="35f117d7396b" name="package_perl_5_18" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"> + <package name="perl" prior_installation_required="True" version="5.18.1" /> + </repository> + + <repository changeset_revision="4d2fd1413b56" name="package_r_3_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"> + <package name="R" version="3.0.1" /> + </repository> + + <!-- allow downloading and installing an Perl package from cpan.org--> + <package>http://search.cpan.org/CPAN/authors/id/T/TO/TODDR/IPC-Run-0.94.tar.gz</package> + <package>http://search.cpan.org/CPAN/authors/id/A/AB/ABIGAIL/Regexp-Common-2013031301.tar.gz</package> + <package>http://search.cpan.org/CPAN/authors/id/F/FA/FANGLY/Statistics-R-0.33.tar.gz</package> + <package>http://search.cpan.org/CPAN/authors/id/J/JM/JMCNAMARA/OLE-Storage_Lite-0.19.tar.gz</package> + <package>http://search.cpan.org/CPAN/authors/id/J/JM/JMCNAMARA/Spreadsheet-WriteExcel-2.40.tar.gz</package> + <package>http://search.cpan.org/CPAN/authors/id/D/DL/DLUX/Parallel-ForkManager-0.7.5.tar.gz</package> + </action> + <action type="set_environment"> + <environment_variable action="prepend_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5</environment_variable> + </action> + </actions> + </install> + </package> + + <package name="perl" version="5.18.1"> + <repository changeset_revision="35f117d7396b" name="package_perl_5_18" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + + <package name="weblogo" version="3.3"> + <repository changeset_revision="648e4b32f15c" name="package_weblogo_3_3" owner="devteam" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + + <package name="numpy" version="1.9"> + <repository changeset_revision="57b37f63cb84" name="package_python_2_7_numpy_1_9" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + + <package name="R" version="3.1.2"> + <repository changeset_revision="4d2fd1413b56" name="package_r_3_1_2" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + + <package name="mutspec" version="0.1"> + <repository changeset_revision="63cc1719e1aa" name="package_r_mutspec_0_1" owner="iarc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + +</tool_dependency> |