comparison test-data/gentest.R @ 2:2a90d2fd3336 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/dada2 commit f2a33fe115fef9d711112b53136cf7619f1b19be"
author iuc
date Mon, 16 Mar 2020 08:41:08 -0400
parents 371afe17d247
children 52a911972c02
comparison
equal deleted inserted replaced
1:fa11922f3d91 2:2a90d2fd3336
library(dada2, quietly = TRUE)
library(ggplot2, quietly = TRUE)

# Sample ids and the paired input files; the i-th entries of fwd, rev,
# filt.fwd and filt.rev belong to the i-th sample.
sample.names <- c('F3D0_S188_L001', 'F3D141_S207_L001')
fwd <- c('F3D0_S188_L001_R1_001.fastq.gz', 'F3D141_S207_L001_R1_001.fastq.gz')
rev <- c('F3D0_S188_L001_R2_001.fastq.gz', 'F3D141_S207_L001_R2_001.fastq.gz')

filt.fwd <- c('filterAndTrim_F3D0_R1.fq.gz', 'filterAndTrim_F3D141_R1.fq.gz')
filt.rev <- c('filterAndTrim_F3D0_R2.fq.gz', 'filterAndTrim_F3D141_R2.fq.gz')

print("filterAndTrim")

# Filter every read pair on its own and write the returned summary to
# "<forward file name up to its first dot>.tab".
# seq_along() keeps the loop from running over c(1, 0) if fwd is ever empty.
for (i in seq_along(fwd)) {
  ftout <- filterAndTrim(fwd[i], filt.fwd[i], rev[i], filt.rev[i])
  summary.file <- paste(strsplit(fwd[i], ".", fixed = TRUE)[[1]][1], "tab", sep = ".")
  write.table(ftout, summary.file, quote = FALSE, sep = "\t", col.names = NA)
}

# In the test only the 1st data set is used
# NOTE(review): ftout holds the result of the LAST loop iteration at this
# point, so the row written below comes from the last sample processed --
# confirm which sample the test expects in filterAndTrim.tab.
# (The table is deliberately not called `t`, which would shadow base::t().)
single.row <- data.frame()
single.row <- rbind(single.row, ftout[1, ])
colnames(single.row) <- colnames(ftout)
rownames(single.row) <- rownames(ftout)[1]
write.table(single.row, "filterAndTrim.tab", quote = FALSE, sep = "\t", col.names = NA)

# Name the file vectors by sample so later steps can index results by id.
names(fwd) <- sample.names
names(rev) <- sample.names
names(filt.fwd) <- sample.names
names(filt.rev) <- sample.names
21 30
# Plot quality profile (just for one file, Galaxy compares with sim_size)
print("plots")
# Output file -> input files, in the order the plots are produced.
quality.inputs <- list(
  'qualityProfile_fwd.pdf' = fwd,
  'qualityProfile_rev.pdf' = rev,
  'qualityProfile.pdf' = fwd[1]
)
for (pdf.name in names(quality.inputs)) {
  qp <- plotQualityProfile(quality.inputs[[pdf.name]])
  ggsave(pdf.name, qp, width = 20, height = 15, units = "cm")
}

# Plot complexity (just for one file, Galaxy compares with sim_size)

complexity.inputs <- list(
  'complexity_fwd.pdf' = fwd,
  'complexity_rev.pdf' = rev,
  'complexity.pdf' = fwd[1]
)
for (pdf.name in names(complexity.inputs)) {
  cp <- plotComplexity(complexity.inputs[[pdf.name]])
  ggsave(pdf.name, cp, width = 20, height = 15, units = "cm")
}
31 48
32 49
# learn Errors
print("learnErrors")

# Forward reads: fit on the filtered files, store the fit, save its plot.
err.fwd <- learnErrors(filt.fwd)
saveRDS(err.fwd, file = "learnErrors_R1.Rdata")
err.plot <- plotErrors(err.fwd)
ggsave("learnErrors_R1.pdf", err.plot, width = 20, height = 15, units = "cm")

# Reverse reads: same steps on the filtered reverse files.
err.rev <- learnErrors(filt.rev)
saveRDS(err.rev, file = "learnErrors_R2.Rdata")
err.plot <- plotErrors(err.rev)
# NOTE(review): unlike the forward plot this file name has no _R2 suffix --
# confirm that is what the tests expect.
ggsave("learnErrors.pdf", err.plot, width = 20, height = 15, units = "cm")
43 61
# dada
print("dada")
dada.fwd <- dada(filt.fwd, err.fwd)
dada.rev <- dada(filt.rev, err.rev)

# One .Rdata file per sample and read direction: dada_<sample>_<R1|R2>.Rdata
dada.by.read <- list(R1 = dada.fwd, R2 = dada.rev)
for (id in sample.names) {
  for (read in names(dada.by.read)) {
    saveRDS(dada.by.read[[read]][[id]], file = paste0("dada_", id, "_", read, ".Rdata"))
  }
}
49 70
# merge pairs
print("mergePairs")
merged <- mergePairs(dada.fwd, filt.fwd, dada.rev, filt.rev)

# Store each sample's merge result separately as mergePairs_<sample>.Rdata
merged.files <- paste0("mergePairs_", sample.names, ".Rdata")
for (k in seq_along(sample.names)) {
  saveRDS(merged[[sample.names[k]]], file = merged.files[k])
}
77
53 78
# make sequence table
print("makeSequenceTable")
seqtab <- makeSequenceTable(merged)
# The table is written transposed.
write.table(t(seqtab), file = "makeSequenceTable.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA)

# Total read count per sequence length, drawn as a bar chart.
reads.per.seqlen <- tapply(colSums(seqtab), factor(nchar(getSequences(seqtab))), sum)
seqlen.counts <- data.frame(length = as.numeric(names(reads.per.seqlen)), count = reads.per.seqlen)
pdf('makeSequenceTable.pdf')
# Print explicitly: a ggplot object is only drawn when printed, and relying
# on top-level auto-printing leaves the PDF empty when this script is run
# through source().
print(
  ggplot(data = seqlen.counts, aes(x = length, y = count)) +
    geom_col() +
    theme_bw()
)
# Capture the return value so dev.off() does not echo the device number.
bequiet <- dev.off()
65 91
# remove bimera
print("removeBimera")
seqtab.nochim <- removeBimeraDenovo(seqtab)
# Write the bimera-filtered table. The previous code wrote `seqtab` here,
# which only duplicated makeSequenceTable.tab under another name.
write.table(t(seqtab.nochim), file = "removeBimeraDenovo.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA)
69 96
# assign taxonomy/species
tl <- c("Level1", "Level2", "Level3", "Level4", "Level5")

# Fixed seed, set before assignTaxonomy() is called.
set.seed(42)
print("assignTaxonomyAndSpecies")
taxa <- assignTaxonomy(
  seqtab.nochim,
  "reference.fa.gz",
  outputBootstraps = TRUE,
  taxLevels = tl,
  multithread = 1
)

# Add species to the assignments, then write the assignments and the
# bootstrap table to separate files.
taxa$tax <- addSpecies(taxa$tax, "reference_species.fa.gz")
write.table(taxa$tax, file = "assignTaxonomyAddspecies.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA)

write.table(taxa$boot, file = "assignTaxonomyAddspecies_boot.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA)
80
81 109
82 110
## Generate extra test data for parameter testing
print("alternatives")

# Single-end filterAndTrim runs with non-default options. Each call is left
# as a bare top-level expression, as in the original.
single.out <- c('filterAndTrim_single_F3D0_R1.fq.gz', 'filterAndTrim_single_F3D141_R1.fq.gz')
filterAndTrim(fwd, single.out, rm.phix = TRUE, orient.fwd = 'TACGG')

trimmers.out <- c('filterAndTrim_single_trimmers_F3D0_R1.fq.gz', 'filterAndTrim_single_trimmers_F3D141_R1.fq.gz')
filterAndTrim(fwd, trimmers.out, truncQ = 30, truncLen = 2, trimLeft = 150, trimRight = 2)

filters.out <- c('filterAndTrim_single_filters_F3D0_R1.fq.gz', 'filterAndTrim_single_filters_F3D141_R1.fq.gz')
filterAndTrim(fwd, filters.out, maxLen = 255, minLen = 60, maxN = 100, minQ = 13, maxEE = 1)


# mergePairs with non-default options, stored per sample.
merged_nondef <- mergePairs(
  dada.fwd, filt.fwd, dada.rev, filt.rev,
  minOverlap = 8, maxMismatch = 1, justConcatenate = TRUE, trimOverhang = TRUE
)
for (id in sample.names) {
  saveRDS(merged_nondef[[id]], file = paste0("mergePairs_", id, "_nondefault.Rdata"))
}

# removeBimeraDenovo on the forward dada result of sample F3D0_S188_L001 only.
rb.dada.fwd <- removeBimeraDenovo(dada.fwd[["F3D0_S188_L001"]])
write.table(rb.dada.fwd, file = 'removeBimeraDenovo_F3D0_dada_uniques.tab', quote = FALSE, sep = "\t", row.names = TRUE, col.names = FALSE)

# removeBimeraDenovo on the merged pairs with method "pooled".
rb.merged <- removeBimeraDenovo(merged, method = "pooled")
saveRDS(rb.merged, file = 'removeBimeraDenovo_F3D0_mergepairs.Rdata')
129
# SeqCounts
# Sum of the values getUniques() returns for x.
getN <- function(x) {
  uniques <- getUniques(x)
  sum(uniques)
}
132
# Read a headerless, tab-separated two-column file and return its second
# column as a vector named by its first column.
#
# fname: path of the file to read.
#
# The previous body read the table into `p` but then indexed an undefined
# `x`, and ended on a `names<-` assignment, so the named vector was never
# returned.
read.uniques <- function(fname) {
  tab <- read.table(fname, header = FALSE, sep = "\t")
  counts <- tab[, 2]
  # as.character() keeps the names plain strings even if column 1 is a factor.
  names(counts) <- as.character(tab[, 1])
  counts
}
138
139
print("seqCounts ft")

# Reads one per-sample filterAndTrim summary; column 1 becomes the row names.
read.filter.tab <- function(path) {
  read.table(path, header = TRUE, sep = "\t", row.names = 1)
}

# Stacks the summaries of tab.files, prefixes every count column with
# "filter " and prepends a `samples` column holding the file names.
filter.counts <- function(tab.files) {
  counts <- do.call(rbind, lapply(tab.files, read.filter.tab))
  names(counts) <- paste("filter", names(counts))
  cbind(data.frame(samples = tab.files), counts)
}

# First sample only.
tdf <- filter.counts("F3D0_S188_L001_R1_001.tab")
write.table(tdf, "seqCounts_filter.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)

# Both samples.
tdf <- filter.counts(c("F3D0_S188_L001_R1_001.tab", "F3D141_S207_L001_R1_001.tab"))
write.table(tdf, "seqCounts_filter_both.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
158
print("seqCounts dada")
# Load the stored forward dada results, keyed by file name.
dada.files <- c("dada_F3D0_S188_L001_R1.Rdata", "dada_F3D141_S207_L001_R1.Rdata")
samples <- setNames(lapply(dada.files, readRDS), dada.files)

# One row per file: file name plus the getN() total of its object.
tdf <- data.frame(samples = names(samples))
tdf[["dadaF"]] <- sapply(samples, getN)
write.table(tdf, "seqCounts_dadaF.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
167
print("seqCounts mp")
# Load the stored mergePairs results, keyed by file name.
merge.files <- c("mergePairs_F3D0_S188_L001.Rdata", "mergePairs_F3D141_S207_L001.Rdata")
samples <- setNames(lapply(merge.files, readRDS), merge.files)

# One row per file: file name plus the getN() total of its object.
tdf <- data.frame(samples = names(samples))
tdf[["merge"]] <- sapply(samples, getN)
write.table(tdf, "seqCounts_merge.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
176
print("seqCounts st")
# makeSequenceTable.tab is read with its first column as row names; each
# remaining column is totalled and reported under its column name.
seqtab.in <- read.table("makeSequenceTable.tab", header = TRUE, sep = "\t", row.names = 1)
per.sample <- colSums(as.matrix(seqtab.in))

tdf <- data.frame(samples = names(per.sample))
tdf[["seqtab"]] <- per.sample
write.table(tdf, "seqCounts_seqtab.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
184
print("seqCounts rb")
# removeBimeraDenovo.tab is read with its first column as row names; each
# remaining column is totalled and reported under its column name.
nochim.in <- read.table("removeBimeraDenovo.tab", header = TRUE, sep = "\t", row.names = 1)
per.sample <- colSums(as.matrix(nochim.in))

tdf <- data.frame(samples = names(per.sample))
tdf[["nochim"]] <- per.sample
write.table(tdf, "seqCounts_nochim.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
192