Mercurial > repos > iuc > edger
changeset 2:a1634a9c2ee1 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/edger commit 910ffba48cb5f981aad1e00b77056bbbec7f9617
author | iuc |
---|---|
date | Thu, 19 Apr 2018 17:26:38 -0400 |
parents | 2a16413ec60d |
children | d79ed3ec25fe |
files | edger.R edger.xml test-data/edgeR_Mut-WT.tsv test-data/edgeR_Mut-WT_2fact.tsv test-data/edgeR_Mut-WT_2fact_anno.tsv test-data/edgeR_Mut-WT_anno.tsv test-data/edgeR_Mut-WT_filt.tsv test-data/edgeR_WT-Mut.tsv test-data/edgeR_WT-Mut_2fact_anno.tsv test-data/edgeR_normcounts.tsv test-data/edgeR_normcounts_anno.tsv test-data/out_rscript.txt |
diffstat | 12 files changed, 818 insertions(+), 81 deletions(-) [+] |
line wrap: on
line diff
--- a/edger.R Tue Jan 30 04:07:08 2018 -0500 +++ b/edger.R Thu Apr 19 17:26:38 2018 -0400 @@ -486,9 +486,9 @@ sumStatus <- summary(status) # Collect counts for differential expression - upCount[i] <- sumStatus["Up"] - downCount[i] <- sumStatus["Down"] - flatCount[i] <- sumStatus["NotSig"] + upCount[i] <- sumStatus["Up", ] + downCount[i] <- sumStatus["Down", ] + flatCount[i] <- sumStatus["NotSig", ] # Write top expressions table top <- topTags(res, n=Inf, sort.by="PValue")
--- a/edger.xml Tue Jan 30 04:07:08 2018 -0500 +++ b/edger.xml Thu Apr 19 17:26:38 2018 -0400 @@ -1,10 +1,11 @@ -<tool id="edger" name="edgeR" version="3.20.7.0"> +<tool id="edger" name="edgeR" version="3.20.7.1"> <description> Perform differential expression of count data </description> <requirements> <requirement type="package" version="3.20.7">bioconductor-edger</requirement> + <requirement type="package" version="3.34.9">bioconductor-limma</requirement> <requirement type="package" version="0.2.15">r-rjson</requirement> <requirement type="package" version="1.20.0">r-getopt</requirement> <requirement type="package" version="1.4.30">r-statmod</requirement> @@ -13,7 +14,7 @@ </requirements> <version_command><![CDATA[ -echo $(R --version | grep version | grep -v GNU)", edgeR version" $(R --vanilla --slave -e "library(edgeR); cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", scales version" $(R --vanilla --slave -e "library(scales); cat(sessionInfo()\$otherPkgs\$scales\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rjson version" $(R --vanilla --slave -e "library(rjson); cat(sessionInfo()\$otherPkgs\$rjson\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", getopt version" $(R --vanilla --slave -e "library(getopt); cat(sessionInfo()\$otherPkgs\$getopt\$Version)" 2> /dev/null | grep -v -i "WARNING: ") +echo $(R --version | grep version | grep -v GNU)", edgeR version" $(R --vanilla --slave -e "library(edgeR); cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", limma version" $(R --vanilla --slave -e "library(limma); cat(sessionInfo()\$otherPkgs\$limma\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", scales version" $(R --vanilla --slave -e "library(scales); cat(sessionInfo()\$otherPkgs\$scales\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rjson version" $(R --vanilla --slave -e "library(rjson); cat(sessionInfo()\$otherPkgs\$rjson\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", 
getopt version" $(R --vanilla --slave -e "library(getopt); cat(sessionInfo()\$otherPkgs\$getopt\$Version)" 2> /dev/null | grep -v -i "WARNING: ") ]]></version_command> <command detect_errors="exit_code"><![CDATA[ @@ -94,6 +95,10 @@ && cp '$outReport.files_path'/*.tsv output_dir/ + +#if $out.rscript: + && cp '$__tool_directory__/edger.R' '$rscript' +#end if ]]></command> <inputs> @@ -222,6 +227,7 @@ label="Output Normalised Counts Table?" help="Output a file containing the normalised counts, these are in log2 counts per million (logCPM). Default: No"> </param> + <param name="rscript" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Output Rscript?" help="If this option is set to Yes, the Rscript used will be provided as a text file in the output. Default: No"/> <param name="rdaOption" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Output RData file?" help="Output all the data used by R to construct the plots and tables, can be loaded into R. A link to the RData file will be provided in the HTML report. 
Default: No"> @@ -259,6 +265,9 @@ <collection name="outTables" type="list" label="${tool.name} on ${on_string}: Tables"> <discover_datasets pattern="(?P<name>.+)\.tsv$" format="tabular" directory="output_dir" visible="false" /> </collection> + <data name="rscript" format="txt" label="${tool.name} on ${on_string}: Rscript"> + <filter>out['rscript']</filter> + </data> </outputs> <tests> @@ -278,8 +287,18 @@ </repeat> <param name="normalisationOption" value="TMM" /> <output_collection name="outTables" count="2"> - <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT.tsv" /> - <element name="edgeR_WT-Mut" ftype="tabular" file="edgeR_WT-Mut.tsv" /> + <element name="edgeR_Mut-WT" ftype="tabular" > + <assert_contents> + <has_text_matching expression="GeneID.*logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*0.4582" /> + </assert_contents> + </element> + <element name="edgeR_WT-Mut" ftype="tabular" > + <assert_contents> + <has_text_matching expression="GeneID.*logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*-0.4582" /> + </assert_contents> + </element> </output_collection> <output name="outReport" > <assert_contents> @@ -305,12 +324,18 @@ </repeat> <param name="normalisationOption" value="TMM" /> <output_collection name="outTables" count="1"> - <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_anno.tsv" /> + <element name="edgeR_Mut-WT" ftype="tabular" > + <assert_contents> + <has_text_matching expression="EntrezID.*Symbol.*logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*Abca4.*0.4582" /> + </assert_contents> + </element> </output_collection> </test> - <!-- Ensure RData file can be output --> + <!-- Ensure RScript and RData file can be output --> <test> <param name="format" value="matrix" /> + <param name="rscript" value="True"/> <param name="rdaOption" value="true" /> <param name="counts" value="matrix.txt" /> <repeat name="rep_factor"> @@ -326,6 +351,7 @@ <has_text 
text="RData" /> </assert_contents> </output> + <output name="rscript" value="out_rscript.txt"/> </test> <!-- Ensure secondary factors work --> <test> @@ -344,7 +370,12 @@ </repeat> <param name="normalisationOption" value="TMM" /> <output_collection name="outTables" count="1" > - <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_2fact.tsv" /> + <element name="edgeR_Mut-WT" ftype="tabular" > + <assert_contents> + <has_text_matching expression="GeneID.*logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*0.4584" /> + </assert_contents> + </element> </output_collection> </test> <!-- Ensure factors file input works --> @@ -358,7 +389,12 @@ </repeat> <param name="normalisationOption" value="TMM" /> <output_collection name="outTables" count="1"> - <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_2fact.tsv" /> + <element name="edgeR_Mut-WT" ftype="tabular" > + <assert_contents> + <has_text_matching expression="GeneID.*logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*0.4584" /> + </assert_contents> + </element> </output_collection> </test> <!-- Ensure normalised counts file output works--> @@ -375,8 +411,18 @@ </repeat> <param name="normalisationOption" value="TMM" /> <output_collection name="outTables" count="2"> - <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT.tsv" /> - <element name="edgeR_normcounts" ftype="tabular" file="edgeR_normcounts.tsv" /> + <element name="edgeR_Mut-WT" ftype="tabular" > + <assert_contents> + <has_text_matching expression="GeneID.*logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*0.4582" /> + </assert_contents> + </element> + <element name="edgeR_normcounts" ftype="tabular" > + <assert_contents> + <has_text_matching expression="GeneID.*Mut1.*Mut2.*Mut3.*WT1.*WT2.*WT3" /> + <has_text_matching expression="11304.*15.7535" /> + </assert_contents> + </element> </output_collection> </test> <!-- Ensure likelihood ratio option works --> @@ 
-438,9 +484,24 @@ </repeat> <param name="normCounts" value="true" /> <output_collection name="outTables" count="3"> - <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_2fact_anno.tsv" /> - <element name="edgeR_WT-Mut" ftype="tabular" file="edgeR_WT-Mut_2fact_anno.tsv" /> - <element name="edgeR_normcounts" ftype="tabular" file="edgeR_normcounts_anno.tsv" /> + <element name="edgeR_Mut-WT" ftype="tabular" > + <assert_contents> + <has_text_matching expression="EntrezID.*Symbol.*logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*Abca4.*0.4584" /> + </assert_contents> + </element> + <element name="edgeR_WT-Mut" ftype="tabular" > + <assert_contents> + <has_text_matching expression="logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*Abca4.*-0.4584" /> + </assert_contents> + </element> + <element name="edgeR_normcounts" ftype="tabular" > + <assert_contents> + <has_text_matching expression="Mut1.*Mut2.*Mut3.*WT1.*WT2.*WT3" /> + <has_text_matching expression="11304.*Abca4.*15.7535" /> + </assert_contents> + </element> </output_collection> </test> <!-- Ensure filtering on CPM in Mnimum Samples works --> @@ -469,7 +530,13 @@ </assert_contents> </output> <output_collection name="outTables" count="1" > - <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_filt.tsv" /> + <element name="edgeR_Mut-WT" ftype="tabular" > + <assert_contents> + <has_text_matching expression="GeneID.*logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*0.4568" /> + <not_has_text text="-0.0682" /> + </assert_contents> + </element> </output_collection> </test> <!-- Ensure filtering on Count in Minmum Samples works --> @@ -497,7 +564,14 @@ </assert_contents> </output> <output_collection name="outTables" count="1" > - <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_filt.tsv" /> + <element name="edgeR_Mut-WT" ftype="tabular" > + <assert_contents> + <has_text_matching 
expression="GeneID.*logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*0.4568" /> + <not_has_text text="-0.0682" /> + </assert_contents> + </element> + </output_collection> </test> <!-- Ensure filtering on Total Count works --> @@ -527,7 +601,13 @@ </assert_contents> </output> <output_collection name="outTables" count="1" > - <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_filt.tsv" /> + <element name="edgeR_Mut-WT" ftype="tabular" > + <assert_contents> + <has_text_matching expression="GeneID.*logFC.*logCPM.*F.*PValue.*FDR" /> + <has_text_matching expression="11304.*0.4568" /> + <not_has_text text="-0.0682" /> + </assert_contents> + </element> </output_collection> </test> </tests> @@ -716,6 +796,7 @@ Optionally, under **Output Options** you can choose to output * a normalised counts table + * the R script used by this tool * an RData file -----
--- a/test-data/edgeR_Mut-WT.tsv Tue Jan 30 04:07:08 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -"GeneID" "logFC" "logCPM" "F" "PValue" "FDR" -"11304" 0.458203001410391 15.530162861746 32.6285109553746 6.943370724917e-06 4.1660224349502e-05 -"11287" 0.188840644104212 17.6536729774735 20.5671667733158 0.000135453949597801 0.000406361848793403 -"11298" -0.138359578382475 17.6815280107154 10.8470695851279 0.00306012801564425 0.00612025603128849 -"11303" -0.0561156581317604 17.8897677663033 1.50815092591008 0.231329593888878 0.346994390833318 -"11305" -0.0579340818829784 18.1615839598046 1.09689306676368 0.305382540289637 0.366459048347564 -"11302" -0.0682406105165454 10.0898264751075 0.137130529665157 0.884266488139469 0.884266488139469
--- a/test-data/edgeR_Mut-WT_2fact.tsv Tue Jan 30 04:07:08 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -"GeneID" "logFC" "logCPM" "F" "PValue" "FDR" -"11287" 0.189281291475186 17.6499778192954 198.646314971919 7.90598427634242e-09 4.74359056580545e-08 -"11298" -0.13798041694802 17.6843133699537 96.2224552671758 4.15830411749776e-06 1.24749123524933e-05 -"11304" 0.458490715244216 15.526484673111 14.5864146735617 0.00244295799161999 0.00488591598323999 -"11303" -0.0560600217169691 17.8909334307093 6.53006937817236 0.0442859767053646 0.0664289650580469 -"11305" -0.0585095825423414 18.1629882429457 1.07140336604322 0.32103822810743 0.385245873728915 -"11302" -0.0716631320244627 10.0898336653124 0.376796260569999 0.878304702615846 0.878304702615846
--- a/test-data/edgeR_Mut-WT_2fact_anno.tsv Tue Jan 30 04:07:08 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -"EntrezID" "Symbol" "GeneName" "Chr" "Length" "logFC" "logCPM" "F" "PValue" "FDR" -11287 "Pzp" "pregnancy zone protein" 6 4681 0.189281947498313 17.6499778192954 198.646315096405 7.90598424818915e-09 4.74359054891349e-08 -11298 "Aanat" "arylalkylamine N-acetyltransferase" 11 1455 -0.137980416947824 17.6843133699537 96.2224553233548 4.15830411749738e-06 1.24749123524921e-05 -11304 "Abca4" "ATP-binding cassette, sub-family A (ABC1), member 4" 3 7248 0.45849071524422 15.526484673111 14.5864146737822 0.00244295799149183 0.00488591598298366 -11303 "Abca1" "ATP-binding cassette, sub-family A (ABC1), member 1" 4 10260 -0.0560600215744048 17.8909334307093 6.53006938009001 0.0442859767053567 0.066428965058035 -11305 "Abca2" "ATP-binding cassette, sub-family A (ABC1), member 2" 2 8061 -0.0585095828508861 18.1629882429457 1.07140336564628 0.321038228193371 0.385245873832045 -11302 "Aatk" "apoptosis-associated tyrosine kinase" 11 5743 -0.0716631320197652 10.0898336653124 0.376796260576849 0.878304702615839 0.878304702615839
--- a/test-data/edgeR_Mut-WT_anno.tsv Tue Jan 30 04:07:08 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -"EntrezID" "Symbol" "GeneName" "Chr" "Length" "logFC" "logCPM" "F" "PValue" "FDR" -11304 "Abca4" "ATP-binding cassette, sub-family A (ABC1), member 4" 3 7248 0.458203001410391 15.530162861746 32.6285109553746 6.943370724917e-06 4.1660224349502e-05 -11287 "Pzp" "pregnancy zone protein" 6 4681 0.188840644104212 17.6536729774735 20.5671667733158 0.000135453949597801 0.000406361848793403 -11298 "Aanat" "arylalkylamine N-acetyltransferase" 11 1455 -0.138359578382475 17.6815280107154 10.8470695851279 0.00306012801564425 0.00612025603128849 -11303 "Abca1" "ATP-binding cassette, sub-family A (ABC1), member 1" 4 10260 -0.0561156581317604 17.8897677663033 1.50815092591008 0.231329593888878 0.346994390833318 -11305 "Abca2" "ATP-binding cassette, sub-family A (ABC1), member 2" 2 8061 -0.0579340818829784 18.1615839598046 1.09689306676368 0.305382540289637 0.366459048347564 -11302 "Aatk" "apoptosis-associated tyrosine kinase" 11 5743 -0.0682406105165454 10.0898264751075 0.137130529665157 0.884266488139469 0.884266488139469
--- a/test-data/edgeR_Mut-WT_filt.tsv Tue Jan 30 04:07:08 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -"GeneID" "logFC" "logCPM" "F" "PValue" "FDR" -"11287" 0.187201149217925 17.6526225386971 165.500659651998 5.18054239620105e-10 2.59027119810053e-09 -"11298" -0.140077523013286 17.6838446963123 82.0496288033128 2.92613742709898e-06 7.31534356774746e-06 -"11304" 0.456820345055957 15.5288695886958 25.2675517854784 6.46433259176098e-05 0.00010773887652935 -"11303" -0.0578468398229744 17.8912127135125 5.26103367901545 0.0384341523491632 0.048042690436454 -"11305" -0.0593023205976883 18.1634104549086 0.864302521617601 0.363623540536245 0.363623540536245
--- a/test-data/edgeR_WT-Mut.tsv Tue Jan 30 04:07:08 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -"GeneID" "logFC" "logCPM" "F" "PValue" "FDR" -"11304" -0.458203001410391 15.530162861746 32.6285109553746 6.943370724917e-06 4.1660224349502e-05 -"11287" -0.188840644104212 17.6536729774735 20.5671667733158 0.000135453949597801 0.000406361848793403 -"11298" 0.138359578382475 17.6815280107154 10.8470695851279 0.00306012801564425 0.00612025603128849 -"11303" 0.0561156581317604 17.8897677663033 1.50815092591008 0.231329593888878 0.346994390833318 -"11305" 0.0579340818829784 18.1615839598046 1.09689306676368 0.305382540289637 0.366459048347564 -"11302" 0.0682406105165454 10.0898264751075 0.137130529665157 0.884266488139469 0.884266488139469
--- a/test-data/edgeR_WT-Mut_2fact_anno.tsv Tue Jan 30 04:07:08 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -"EntrezID" "Symbol" "GeneName" "Chr" "Length" "logFC" "logCPM" "F" "PValue" "FDR" -11287 "Pzp" "pregnancy zone protein" 6 4681 -0.189281947498313 17.6499778192954 198.646315096405 7.90598424818915e-09 4.74359054891349e-08 -11298 "Aanat" "arylalkylamine N-acetyltransferase" 11 1455 0.137980416947824 17.6843133699537 96.2224553233548 4.15830411749738e-06 1.24749123524921e-05 -11304 "Abca4" "ATP-binding cassette, sub-family A (ABC1), member 4" 3 7248 -0.45849071524422 15.526484673111 14.5864146737822 0.00244295799149183 0.00488591598298366 -11303 "Abca1" "ATP-binding cassette, sub-family A (ABC1), member 1" 4 10260 0.0560600215744048 17.8909334307093 6.53006938009001 0.0442859767053567 0.066428965058035 -11305 "Abca2" "ATP-binding cassette, sub-family A (ABC1), member 2" 2 8061 0.0585095828508861 18.1629882429457 1.07140336564628 0.321038228193371 0.385245873832045 -11302 "Aatk" "apoptosis-associated tyrosine kinase" 11 5743 0.0716631320197652 10.0898336653124 0.376796260576849 0.878304702615839 0.878304702615839
--- a/test-data/edgeR_normcounts.tsv Tue Jan 30 04:07:08 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -"GeneID" "Mut1" "Mut2" "Mut3" "WT1" "WT2" "WT3" -"11287" 17.7717801382127 17.7103668584544 17.7656984572699 17.6075444214943 17.5078565133576 17.5637960881114 -"11298" 17.6504754185442 17.55181161064 17.6142553019077 17.7726234935868 17.6985800110028 17.7597848438911 -"11302" 9.64041099082467 9.8551982993804 9.60469198931215 9.52851478148979 9.97869946791847 9.78190633986473 -"11303" 17.8772707356813 17.7864068634935 17.9114914356477 17.9125147871338 17.8772755854201 17.9551530504837 -"11304" 15.753577788623 15.8510977521242 15.6551142861549 15.3537170121875 15.2168364952853 15.3165751633072 -"11305" 18.0400277799982 18.1407817993511 18.2048423497925 18.1807759635442 18.1818136580236 18.2026167343562
--- a/test-data/edgeR_normcounts_anno.tsv Tue Jan 30 04:07:08 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -"EntrezID" "Symbol" "GeneName" "Chr" "Length" "Mut1" "Mut2" "Mut3" "WT1" "WT2" "WT3" -11287 "Pzp" "pregnancy zone protein" 6 4681 17.7717801382127 17.7103668584544 17.7656984572699 17.6075444214943 17.5078565133576 17.5637960881114 -11298 "Aanat" "arylalkylamine N-acetyltransferase" 11 1455 17.6504754185442 17.55181161064 17.6142553019077 17.7726234935868 17.6985800110028 17.7597848438911 -11302 "Aatk" "apoptosis-associated tyrosine kinase" 11 5743 9.64041099082467 9.8551982993804 9.60469198931215 9.52851478148979 9.97869946791847 9.78190633986473 -11303 "Abca1" "ATP-binding cassette, sub-family A (ABC1), member 1" 4 10260 17.8772707356813 17.7864068634935 17.9114914356477 17.9125147871338 17.8772755854201 17.9551530504837 -11304 "Abca4" "ATP-binding cassette, sub-family A (ABC1), member 4" 3 7248 15.753577788623 15.8510977521242 15.6551142861549 15.3537170121875 15.2168364952853 15.3165751633072 -11305 "Abca2" "ATP-binding cassette, sub-family A (ABC1), member 2" 2 8061 18.0400277799982 18.1407817993511 18.2048423497925 18.1807759635442 18.1818136580236 18.2026167343562
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out_rscript.txt Thu Apr 19 17:26:38 2018 -0400 @@ -0,0 +1,718 @@ +# This tool takes in a matrix of feature counts as well as gene annotations and +# outputs a table of top expressions as well as various plots for differential +# expression analysis +# +# ARGS: htmlPath", "R", 1, "character" -Path to html file linking to other outputs +# outPath", "o", 1, "character" -Path to folder to write all output to +# filesPath", "j", 2, "character" -JSON list object if multiple files input +# matrixPath", "m", 2, "character" -Path to count matrix +# factFile", "f", 2, "character" -Path to factor information file +# factInput", "i", 2, "character" -String containing factors if manually input +# annoPath", "a", 2, "character" -Path to input containing gene annotations +# contrastData", "C", 1, "character" -String containing contrasts of interest +# cpmReq", "c", 2, "double" -Float specifying cpm requirement +# cntReq", "z", 2, "integer" -Integer specifying minimum total count requirement +# sampleReq", "s", 2, "integer" -Integer specifying cpm requirement +# normCounts", "x", 0, "logical" -String specifying if normalised counts should be output +# rdaOpt", "r", 0, "logical" -String specifying if RData should be output +# lfcReq", "l", 1, "double" -Float specifying the log-fold-change requirement +# pValReq", "p", 1, "double" -Float specifying the p-value requirement +# pAdjOpt", "d", 1, "character" -String specifying the p-value adjustment method +# normOpt", "n", 1, "character" -String specifying type of normalisation used +# robOpt", "b", 0, "logical" -String specifying if robust options should be used +# lrtOpt", "t", 0, "logical" -String specifying whether to perform LRT test instead +# +# OUT: +# MDS Plot +# BCV Plot +# QL Plot +# MD Plot +# Expression Table +# HTML file linking to the ouputs +# Optional: +# Normalised counts Table +# RData file +# +# Author: Shian Su - registertonysu@gmail.com - Jan 2014 +# 
Modified by: Maria Doyle - Oct 2017 (some code taken from the DESeq2 wrapper) + +# Record starting time +timeStart <- as.character(Sys.time()) + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +# Load all required libraries +library(methods, quietly=TRUE, warn.conflicts=FALSE) +library(statmod, quietly=TRUE, warn.conflicts=FALSE) +library(splines, quietly=TRUE, warn.conflicts=FALSE) +library(edgeR, quietly=TRUE, warn.conflicts=FALSE) +library(limma, quietly=TRUE, warn.conflicts=FALSE) +library(scales, quietly=TRUE, warn.conflicts=FALSE) +library(getopt, quietly=TRUE, warn.conflicts=FALSE) + +################################################################################ +### Function Delcaration +################################################################################ +# Function to sanitise contrast equations so there are no whitespaces +# surrounding the arithmetic operators, leading or trailing whitespace +sanitiseEquation <- function(equation) { + equation <- gsub(" *[+] *", "+", equation) + equation <- gsub(" *[-] *", "-", equation) + equation <- gsub(" *[/] *", "/", equation) + equation <- gsub(" *[*] *", "*", equation) + equation <- gsub("^\\s+|\\s+$", "", equation) + return(equation) +} + +# Function to sanitise group information +sanitiseGroups <- function(string) { + string <- gsub(" *[,] *", ",", string) + string <- gsub("^\\s+|\\s+$", "", string) + return(string) +} + +# Function to change periods to whitespace in a string +unmake.names <- function(string) { + string <- gsub(".", " ", string, fixed=TRUE) + return(string) +} + +# Generate output folder and paths +makeOut <- function(filename) { + return(paste0(opt$outPath, "/", filename)) +} + +# Generating design information +pasteListName <- 
function(string) { + return(paste0("factors$", string)) +} + +# Create cata function: default path set, default seperator empty and appending +# true by default (Ripped straight from the cat function with altered argument +# defaults) +cata <- function(..., file=opt$htmlPath, sep="", fill=FALSE, labels=NULL, + append=TRUE) { + if (is.character(file)) + if (file == "") + file <- stdout() + else if (substring(file, 1L, 1L) == "|") { + file <- pipe(substring(file, 2L), "w") + on.exit(close(file)) + } + else { + file <- file(file, ifelse(append, "a", "w")) + on.exit(close(file)) + } + .Internal(cat(list(...), file, sep, fill, labels, append)) +} + +# Function to write code for html head and title +HtmlHead <- function(title) { + cata("<head>\n") + cata("<title>", title, "</title>\n") + cata("</head>\n") +} + +# Function to write code for html links +HtmlLink <- function(address, label=address) { + cata("<a href=\"", address, "\" target=\"_blank\">", label, "</a><br />\n") +} + +# Function to write code for html images +HtmlImage <- function(source, label=source, height=600, width=600) { + cata("<img src=\"", source, "\" alt=\"", label, "\" height=\"", height) + cata("\" width=\"", width, "\"/>\n") +} + +# Function to write code for html list items +ListItem <- function(...) { + cata("<li>", ..., "</li>\n") +} + +TableItem <- function(...) { + cata("<td>", ..., "</td>\n") +} + +TableHeadItem <- function(...) { + cata("<th>", ..., "</th>\n") +} + +################################################################################ +### Input Processing +################################################################################ + +# Collect arguments from command line +args <- commandArgs(trailingOnly=TRUE) + +# Get options, using the spec as defined by the enclosed list. +# Read the options from the default: commandArgs(TRUE). 
+spec <- matrix(c( + "htmlPath", "R", 1, "character", + "outPath", "o", 1, "character", + "filesPath", "j", 2, "character", + "matrixPath", "m", 2, "character", + "factFile", "f", 2, "character", + "factInput", "i", 2, "character", + "annoPath", "a", 2, "character", + "contrastData", "C", 1, "character", + "cpmReq", "c", 1, "double", + "totReq", "y", 0, "logical", + "cntReq", "z", 1, "integer", + "sampleReq", "s", 1, "integer", + "normCounts", "x", 0, "logical", + "rdaOpt", "r", 0, "logical", + "lfcReq", "l", 1, "double", + "pValReq", "p", 1, "double", + "pAdjOpt", "d", 1, "character", + "normOpt", "n", 1, "character", + "robOpt", "b", 0, "logical", + "lrtOpt", "t", 0, "logical"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + + +if (is.null(opt$matrixPath) & is.null(opt$filesPath)) { + cat("A counts matrix (or a set of counts files) is required.\n") + q(status=1) +} + +if (is.null(opt$cpmReq)) { + filtCPM <- FALSE +} else { + filtCPM <- TRUE +} + +if (is.null(opt$cntReq) || is.null(opt$sampleReq)) { + filtSmpCount <- FALSE +} else { + filtSmpCount <- TRUE +} + +if (is.null(opt$totReq)) { + filtTotCount <- FALSE +} else { + filtTotCount <- TRUE +} + +if (is.null(opt$lrtOpt)) { + wantLRT <- FALSE +} else { + wantLRT <- TRUE +} + +if (is.null(opt$rdaOpt)) { + wantRda <- FALSE +} else { + wantRda <- TRUE +} + +if (is.null(opt$annoPath)) { + haveAnno <- FALSE +} else { + haveAnno <- TRUE +} + +if (is.null(opt$normCounts)) { + wantNorm <- FALSE +} else { + wantNorm <- TRUE +} + +if (is.null(opt$robOpt)) { + wantRobust <- FALSE +} else { + wantRobust <- TRUE +} + + +if (!is.null(opt$filesPath)) { + # Process the separate count files (adapted from DESeq2 wrapper) + library("rjson") + parser <- newJSONParser() + parser$addData(opt$filesPath) + factorList <- parser$getObject() + factors <- sapply(factorList, function(x) x[[1]]) + filenamesIn <- unname(unlist(factorList[[1]][[2]])) + sampleTable <- data.frame(sample=basename(filenamesIn), + filename=filenamesIn, + 
row.names=filenamesIn, + stringsAsFactors=FALSE) + for (factor in factorList) { + factorName <- factor[[1]] + sampleTable[[factorName]] <- character(nrow(sampleTable)) + lvls <- sapply(factor[[2]], function(x) names(x)) + for (i in seq_along(factor[[2]])) { + files <- factor[[2]][[i]][[1]] + sampleTable[files,factorName] <- lvls[i] + } + sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls) + } + rownames(sampleTable) <- sampleTable$sample + rem <- c("sample","filename") + factors <- sampleTable[, !(names(sampleTable) %in% rem), drop=FALSE] + + #read in count files and create single table + countfiles <- lapply(sampleTable$filename, function(x){read.delim(x, row.names=1)}) + counts <- do.call("cbind", countfiles) + +} else { + # Process the single count matrix + counts <- read.table(opt$matrixPath, header=TRUE, sep="\t", stringsAsFactors=FALSE) + row.names(counts) <- counts[, 1] + counts <- counts[ , -1] + countsRows <- nrow(counts) + + # Process factors + if (is.null(opt$factInput)) { + factorData <- read.table(opt$factFile, header=TRUE, sep="\t") + factors <- factorData[, -1, drop=FALSE] + } else { + factors <- unlist(strsplit(opt$factInput, "|", fixed=TRUE)) + factorData <- list() + for (fact in factors) { + newFact <- unlist(strsplit(fact, split="::")) + factorData <- rbind(factorData, newFact) + } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor. 
# NOTE(review): the statements down to the two closing braces below finish a
# nested block that is opened earlier in the file, outside this section.

        # Set the row names to be the name of the factor and delete first column
        row.names(factorData) <- factorData[, 1]
        factorData <- factorData[, -1]
        factorData <- sapply(factorData, sanitiseGroups)
        factorData <- sapply(factorData, strsplit, split=",")
        factorData <- sapply(factorData, make.names)
        # Transform factor data into data frame of R factor objects
        factors <- data.frame(factorData)
    }
}

# If an annotation file was provided, load it
if (haveAnno) {
    geneanno <- read.table(opt$annoPath, header=TRUE, sep="\t", stringsAsFactors=FALSE)
}

# Create output directory
dir.create(opt$outPath, showWarnings=FALSE)

# Split up contrasts separated by comma into a vector then sanitise
contrastData <- unlist(strsplit(opt$contrastData, split=","))
contrastData <- sanitiseEquation(contrastData)
contrastData <- gsub(" ", ".", contrastData, fixed=TRUE)

# Output paths for the plots that are produced once per run
bcvOutPdf <- makeOut("bcvplot.pdf")
bcvOutPng <- makeOut("bcvplot.png")
qlOutPdf <- makeOut("qlplot.pdf")
qlOutPng <- makeOut("qlplot.png")
mdsOutPdf <- makeOut("mdsplot.pdf")
mdsOutPng <- makeOut("mdsplot.png")

# Output paths that exist once per contrast, stored as vectors indexed by
# contrast position
mdOutPdf <- character()
mdOutPng <- character()
topOut <- character()
for (i in seq_along(contrastData)) {
    mdOutPdf[i] <- makeOut(paste0("mdplot_", contrastData[i], ".pdf"))
    mdOutPng[i] <- makeOut(paste0("mdplot_", contrastData[i], ".png"))
    topOut[i] <- makeOut(paste0("edgeR_", contrastData[i], ".tsv"))
}
normOut <- makeOut("edgeR_normcounts.tsv")
rdaOut <- makeOut("edgeR_analysis.RData")
sessionOut <- makeOut("session_info.txt")

# Initialise data for html links and images, data frame with columns Label and
# Link
linkData <- data.frame(Label=character(), Link=character(), stringsAsFactors=FALSE)
imageData <- data.frame(Label=character(), Link=character(), stringsAsFactors=FALSE)

# Initialise vectors for storage of up/down/neutral regulated counts
upCount <- numeric()
downCount <- numeric()
flatCount <- numeric()

################################################################################
### Data Processing
################################################################################

# Extract counts and annotation data
data <- list()
data$counts <- counts
if (haveAnno) {
    data$genes <- geneanno
} else {
    data$genes <- data.frame(GeneID=row.names(counts))
}

# If filter criteria are set, filter out genes that do not have a required
# cpm/counts in a required number of samples. Default is no filtering
preFilterCount <- nrow(data$counts)

if (filtCPM || filtSmpCount || filtTotCount) {

    # Only one criterion is applied; total-count takes precedence over
    # per-sample count, which takes precedence over CPM
    if (filtTotCount) {
        keep <- rowSums(data$counts) >= opt$cntReq
    } else if (filtSmpCount) {
        keep <- rowSums(data$counts >= opt$cntReq) >= opt$sampleReq
    } else if (filtCPM) {
        keep <- rowSums(cpm(data$counts) >= opt$cpmReq) >= opt$sampleReq
    }

    data$counts <- data$counts[keep, ]
    data$genes <- data$genes[keep, , drop=FALSE]
}

postFilterCount <- nrow(data$counts)
filteredCount <- preFilterCount-postFilterCount

# Creating naming data
samplenames <- colnames(data$counts)
sampleanno <- data.frame("sampleID"=samplenames, factors)


# Generating the DGEList object "data"
data$samples <- sampleanno
data$samples$lib.size <- colSums(data$counts)
data$samples$norm.factors <- 1
row.names(data$samples) <- colnames(data$counts)
data <- new("DGEList", data)

# Name rows of factors according to their sample. colnames() is used (rather
# than names()) so this agrees with samplenames above and also works when the
# counts are held as a matrix
row.names(factors) <- colnames(data$counts)
factorList <- sapply(names(factors), pasteListName)

# Build an intercept-free additive model over all factors: ~0+f1+f2+...
formula <- "~0"
for (i in seq_along(factorList)) {
    formula <- paste(formula, factorList[i], sep="+")
}

formula <- formula(formula)
design <- model.matrix(formula)

# Strip the factor-term text from the design column names so that only the
# level names remain
for (i in seq_along(factorList)) {
    colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE)
}

# Calculating normalising factor, estimating dispersion
data <- calcNormFactors(data, method=opt$normOpt)

if (wantRobust) {
    data <- estimateDisp(data, design=design, robust=TRUE)
} else {
    data <- estimateDisp(data, design=design)
}

# Generate contrasts information
contrasts <- makeContrasts(contrasts=contrastData, levels=design)

################################################################################
### Data Output
################################################################################

# Plot MDS
labels <- colnames(counts)
png(mdsOutPng, width=600, height=600)
# Currently only using a single factor
plotMDS(data, labels=labels, col=as.numeric(factors[, 1]), cex=0.8, main="MDS Plot")
imageData[1, ] <- c("MDS Plot", "mdsplot.png")
invisible(dev.off())

pdf(mdsOutPdf)
plotMDS(data, labels=labels, cex=0.5)
linkData[1, ] <- c("MDS Plot.pdf", "mdsplot.pdf")
invisible(dev.off())

# BCV Plot
png(bcvOutPng, width=600, height=600)
plotBCV(data, main="BCV Plot")
imgName <- "BCV Plot"
imgAddr <- "bcvplot.png"
imageData <- rbind(imageData, c(imgName, imgAddr))
invisible(dev.off())

pdf(bcvOutPdf)
plotBCV(data, main="BCV Plot")
linkName <- "BCV Plot.pdf"
linkAddr <- "bcvplot.pdf"
linkData <- rbind(linkData, c(linkName, linkAddr))
invisible(dev.off())

# Generate fit
if (wantLRT) {

    fit <- glmFit(data, design)

} else {

    if (wantRobust) {
        fit <- glmQLFit(data, design, robust=TRUE)
    } else {
        fit <- glmQLFit(data, design)
    }

    # Plot QL dispersions (only produced for the quasi-likelihood fit)
    png(qlOutPng, width=600, height=600)
    plotQLDisp(fit, main="QL Plot")
    imgName <- "QL Plot"
    imgAddr <- "qlplot.png"
    imageData <- rbind(imageData, c(imgName, imgAddr))
    invisible(dev.off())

    pdf(qlOutPdf)
    plotQLDisp(fit, main="QL Plot")
    linkName <- "QL Plot.pdf"
    linkAddr <- "qlplot.pdf"
    linkData <- rbind(linkData, c(linkName, linkAddr))
    invisible(dev.off())
}

# Save normalised counts (log2cpm)
if (wantNorm) {
    normalisedCounts <- cpm(data, normalized.lib.sizes=TRUE, log=TRUE)
    normalisedCounts <- data.frame(data$genes, normalisedCounts)
    write.table(normalisedCounts, file=normOut, row.names=FALSE, sep="\t")
    linkData <- rbind(linkData, c("edgeR_normcounts.tsv", "edgeR_normcounts.tsv"))
}


# Test each contrast, record the up/down/not-significant counts, write the
# table of results and draw the MD plots
for (i in seq_along(contrastData)) {
    if (wantLRT) {
        res <- glmLRT(fit, contrast=contrasts[, i])
    } else {
        res <- glmQLFTest(fit, contrast=contrasts[, i])
    }

    status <- decideTestsDGE(res, adjust.method=opt$pAdjOpt, p.value=opt$pValReq,
                             lfc=opt$lfcReq)
    sumStatus <- summary(status)

    # Collect counts for differential expression
    upCount[i] <- sumStatus["Up", ]
    downCount[i] <- sumStatus["Down", ]
    flatCount[i] <- sumStatus["NotSig", ]

    # Write top expressions table
    top <- topTags(res, n=Inf, sort.by="PValue")
    write.table(top, file=topOut[i], row.names=FALSE, sep="\t")

    linkName <- paste0("edgeR_", contrastData[i], ".tsv")
    linkAddr <- paste0("edgeR_", contrastData[i], ".tsv")
    linkData <- rbind(linkData, c(linkName, linkAddr))

    # Plot MD (log ratios vs mean difference) using limma package
    pdf(mdOutPdf[i])
    limma::plotMD(res, status=status,
                  main=paste("MD Plot:", unmake.names(contrastData[i])),
                  col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"),
                  xlab="Average Expression", ylab="logFC")

    abline(h=0, col="grey", lty=2)

    linkName <- paste0("MD Plot_", contrastData[i], ".pdf")
    linkAddr <- paste0("mdplot_", contrastData[i], ".pdf")
    linkData <- rbind(linkData, c(linkName, linkAddr))
    invisible(dev.off())

    png(mdOutPng[i], height=600, width=600)
    limma::plotMD(res, status=status,
                  main=paste("MD Plot:", unmake.names(contrastData[i])),
                  col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"),
                  xlab="Average Expression", ylab="logFC")

    abline(h=0, col="grey", lty=2)

    imgName <- paste0("MD Plot_", contrastData[i], ".png")
    imgAddr <- paste0("mdplot_", contrastData[i], ".png")
    imageData <- rbind(imageData, c(imgName, imgAddr))
    invisible(dev.off())
}
sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount)
row.names(sigDiff) <- contrastData

# Save relevant items as rda object.
# Note: status, res and top hold the values from the last contrast processed
# by the loop above.
if (wantRda) {
    if (wantNorm) {
        save(counts, data, status, normalisedCounts, labels, factors, fit, res, top, contrasts, design,
             file=rdaOut, ascii=TRUE)
    } else {
        save(counts, data, status, labels, factors, fit, res, top, contrasts, design,
             file=rdaOut, ascii=TRUE)
    }
    linkData <- rbind(linkData, c("edgeR_analysis.RData", "edgeR_analysis.RData"))
}

# Record session info
writeLines(capture.output(sessionInfo()), sessionOut)
linkData <- rbind(linkData, c("Session Info", "session_info.txt"))

# Record ending time and calculate total run time
timeEnd <- as.character(Sys.time())
timeTaken <- capture.output(round(difftime(timeEnd, timeStart), digits=3))
timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE)

################################################################################
### HTML Generation
################################################################################

# Clear file
cat("", file=opt$htmlPath)

cata("<html>\n")

cata("<body>\n")
cata("<h3>edgeR Analysis Output:</h3>\n")
cata("Links to PDF copies of plots are in 'Plots' section below.<br />\n")

# Embed every PNG image recorded above, in the order it was produced
for (i in seq_len(nrow(imageData))) {
    HtmlImage(imageData$Link[i], imageData$Label[i])
}

cata("<h4>Differential Expression Counts:</h4>\n")

cata("<table border=\"1\" cellpadding=\"4\">\n")
cata("<tr>\n")
TableItem()
for (i in colnames(sigDiff)) {
    TableHeadItem(i)
}
cata("</tr>\n")
for (i in seq_len(nrow(sigDiff))) {
    cata("<tr>\n")
    TableHeadItem(unmake.names(row.names(sigDiff)[i]))
    for (j in seq_len(ncol(sigDiff))) {
        TableItem(as.character(sigDiff[i, j]))
    }
    cata("</tr>\n")
}
cata("</table>")

# Links are grouped by file extension. The patterns escape the dot and are
# anchored at the end so that a contrast name that merely contains e.g. "pdf"
# cannot put a file into the wrong section.
cata("<h4>Plots:</h4>\n")
for (i in which(grepl("\\.pdf$", linkData$Link))) {
    HtmlLink(linkData$Link[i], linkData$Label[i])
}

cata("<h4>Tables:</h4>\n")
for (i in which(grepl("\\.tsv$", linkData$Link))) {
    HtmlLink(linkData$Link[i], linkData$Label[i])
}

if (wantRda) {
    cata("<h4>R Data Objects:</h4>\n")
    for (i in which(grepl("\\.RData$", linkData$Link))) {
        HtmlLink(linkData$Link[i], linkData$Label[i])
    }
}

cata("<p>Alt-click links to download file.</p>\n")
cata("<p>Click floppy disc icon associated history item to download ")
cata("all files.</p>\n")
cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n")

cata("<h4>Additional Information</h4>\n")
cata("<ul>\n")

if (filtCPM || filtSmpCount || filtTotCount) {
    if (filtCPM) {
        # opt$cpmReq is the CPM threshold used by the filtering step above
        tempStr <- paste("Genes without more than", opt$cpmReq,
                         "CPM in at least", opt$sampleReq, "samples are insignificant",
                         "and filtered out.")
    } else if (filtSmpCount) {
        tempStr <- paste("Genes without more than", opt$cntReq,
                         "counts in at least", opt$sampleReq, "samples are insignificant",
                         "and filtered out.")
    } else if (filtTotCount) {
        tempStr <- paste("Genes without more than", opt$cntReq,
                         "counts, after summing counts for all samples, are insignificant",
                         "and filtered out.")
    }

    ListItem(tempStr)
    filterProp <- round(filteredCount/preFilterCount*100, digits=2)
    tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp,
                      "%) genes were filtered out for low expression.")
    ListItem(tempStr)
}
ListItem(opt$normOpt, " was the method used to normalise library sizes.")
if (wantLRT) {
    ListItem("The edgeR likelihood ratio test was used.")
} else {
    if (wantRobust) {
        ListItem("The edgeR quasi-likelihood test was used with robust settings (robust=TRUE with estimateDisp and glmQLFit).")
    } else {
        ListItem("The edgeR quasi-likelihood test was used.")
    }
}
# Describe the significance threshold used for highlighting in the MD plots.
# Adjustment methods other than "none", "BH", "BY" and "holm" produce no
# sentence here.
if (opt$pAdjOpt!="none") {
    if (opt$pAdjOpt=="BH" || opt$pAdjOpt=="BY") {
        tempStr <- paste0("MD-Plot highlighted genes are significant at FDR ",
                          "of ", opt$pValReq," and exhibit log2-fold-change of at ",
                          "least ", opt$lfcReq, ".")
        ListItem(tempStr)
    } else if (opt$pAdjOpt=="holm") {
        tempStr <- paste0("MD-Plot highlighted genes are significant at adjusted ",
                          "p-value of ", opt$pValReq," by the Holm(1979) ",
                          "method, and exhibit log2-fold-change of at least ",
                          opt$lfcReq, ".")
        ListItem(tempStr)
    }
} else {
    tempStr <- paste0("MD-Plot highlighted genes are significant at p-value ",
                      "of ", opt$pValReq," and exhibit log2-fold-change of at ",
                      "least ", opt$lfcReq, ".")
    ListItem(tempStr)
}
cata("</ul>\n")

cata("<h4>Summary of experimental data:</h4>\n")

cata("<p>*CHECK THAT SAMPLES ARE ASSOCIATED WITH CORRECT GROUP(S)*</p>\n")

cata("<table border=\"1\" cellpadding=\"3\">\n")
cata("<tr>\n")
TableHeadItem("SampleID")
TableHeadItem(names(factors)[1], " (Primary Factor)")

# Remaining factors, if any; names(factors)[-1] is empty for a single factor
for (i in names(factors)[-1]) {
    TableHeadItem(i)
}
# Close the header row whether or not there are additional factors
cata("</tr>\n")

for (i in seq_len(nrow(factors))) {
    cata("<tr>\n")
    TableHeadItem(row.names(factors)[i])
    for (j in seq_len(ncol(factors))) {
        TableItem(as.character(unmake.names(factors[i, j])))
    }
    cata("</tr>\n")
}
cata("</table>")

for (i in which(grepl("session_info", linkData$Link))) {
    HtmlLink(linkData$Link[i], linkData$Label[i])
}

cata("<table border=\"0\">\n")
cata("<tr>\n")
TableItem("Task started at:"); TableItem(timeStart)
cata("</tr>\n")
cata("<tr>\n")
TableItem("Task ended at:"); TableItem(timeEnd)
cata("</tr>\n")
cata("<tr>\n")
TableItem("Task run time:"); TableItem(timeTaken)
cata("</tr>\n")
cata("</table>\n")

cata("</body>\n")
cata("</html>")