comparison IdValid.R @ 0:8c472c4f1bf5 draft

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tools/vigiechiro commit d2de8e10c11bfa3b04729e59bba58e08d23b56aa
author ecology
date Wed, 13 Mar 2019 11:18:58 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:8c472c4f1bf5
1 #!/usr/bin/env Rscript
2
3 suppressMessages(library(data.table))
4
5 ValidHier=function(x,y) #used to write validator id over observer id
6 {
7 #cat(y)
8 if(is.na(y)){x}else{y}
9 }
10
11 f2p <- function(x) #get date-time data from recording file names
12 {
13 if (is.data.frame((x)[1])) {pretemps <- vector(length = nrow(x))}
14 op <- options(digits.secs = 3)
15 pretemps <- paste(substr(x, nchar(x) - 18, nchar(x)-4), ".", substr(x, nchar(x) - 2, nchar(x)), sep = "")
16 strptime(pretemps, "%Y%m%d_%H%M%OS",tz="UTC")-7200
17 }
18
19 args <- commandArgs(trailingOnly = TRUE)
20
21
22 IdCorrect=fread(args[1])
23 RefSeuil=fread(args[2])
24 #IdV=as.data.frame(subset(IdCorrect,select=observateur_taxon:validateur_probabilite))
25
26 #Step 0 :compute id score from 2nd Layer
27 test=match("participation",names(IdCorrect))
28 IdCorrect$IdScore=apply(as.data.frame(IdCorrect)[,(test+1):(ncol(IdCorrect)-1)],MARGIN=1,FUN=max)
29 #compute true success probabilities according to logistic regression issued from "Referentiel_seuils"
30 CorrSp=match(IdCorrect$ProbEsp_C2bs,RefSeuil$Espece)
31 PSp=RefSeuil$Pente[CorrSp]
32 ISp=RefSeuil$Int[CorrSp]
33 suppressWarnings(IdCorrect$IdProb<-mapply(FUN=function(w,x,y) if((!is.na(y))&(y>0)&(y<1000)) {(exp(y*w+x)/(1+exp(y*w+x)))}else{w} ,IdCorrect$IdScore,ISp,PSp))
34
35 #Step 1 :compute id with confidence regarding a hierarchy (validator > observer)
36 IdCorrect$IdV=mapply(ValidHier,IdCorrect$observateur_taxon,IdCorrect$validateur_taxon)
37 IdCorrect$ConfV=mapply(ValidHier,IdCorrect$observateur_probabilite
38 ,IdCorrect$validateur_probabilite)
39
40
41
42 #Step 2: Get numerictime data
43 suppressWarnings(IdCorrect$Session<-NULL)
44 suppressWarnings(IdCorrect$TimeNum<-NULL)
45
46 if (substr(IdCorrect$`nom du fichier`[1],2,2)=="i") #for car/walk transects
47 {
48 FileInfo=as.data.table(tstrsplit(IdCorrect$`nom du fichier`,"-"))
49 IdCorrect$Session=as.numeric(substr(FileInfo$V4,5,nchar(FileInfo$V4)))
50 TimeSec=as.data.table(tstrsplit(FileInfo$V5,"_"))
51 TimeSec=as.data.frame(TimeSec)
52 if(sum(TimeSec[,(ncol(TimeSec)-1)]!="00000")==0) #to deal with double Kaleidoscope treatments
53 {
54 print("NOMS DE FICHIERS NON CONFORMES")
55 print("Vous les avez probablement traiter 2 fois par Kaleidoscope")
56 stop("Merci de nous signaler cette erreur par mail pour correction")
57 }else{
58 IdCorrect$TimeNum=(IdCorrect$Session*800
59 +as.numeric(TimeSec[,(ncol(TimeSec)-1)])
60 +as.numeric(TimeSec[,(ncol(TimeSec))])/1000)
61 }
62
63 }else{
64 if(substr(IdCorrect$`nom du fichier`[1],2,2)=="a") #for stationary recordings
65 {
66 DateRec=as.POSIXlt(f2p(IdCorrect$`nom du fichier`))
67 Nuit=format(as.Date(DateRec-43200*(DateRec$hour<12)),format="%d/%m/%Y")
68 #Nuit[is.na(Nuit)]=0
69 IdCorrect$Session=Nuit
70 IdCorrect$TimeNum=as.numeric(DateRec)
71
72 }else{
73 print("NOMS DE FICHIERS NON CONFORMES")
74 stop("Ils doivent commencer par Cir (routier/pedestre) ou par Car (points fixes")
75 }
76 }
77
78
79
80
81 #Step 3 :treat sequentially each species identified by Tadarida-C
82 IdExtrap=vector() #to store the id extrapolated from validations
83 IdC2=IdCorrect[0,] #to store data in the right order
84 TypeE=vector() #to store the type of extrapolation made
85 for (j in 1:nlevels(as.factor(IdCorrect$ProbEsp_C2bs)))
86 {
87 IdSp=subset(IdCorrect
88 ,IdCorrect$ProbEsp_C2bs==levels(as.factor(IdCorrect$ProbEsp_C2bs))[j])
89 if(sum(is.na(IdSp$IdV))==(nrow(IdSp))) #case 1 : no validation no change
90 {
91 IdC2=rbind(IdC2,IdSp)
92 IdExtrap=c(IdExtrap,rep(IdSp$ProbEsp_C2bs[1],nrow(IdSp)))
93 TypeE=c(TypeE,rep(0,nrow(IdSp)))
94 }else{ #case 2: some validation
95 Vtemp=subset(IdSp,is.na(IdSp$IdV))
96 #case2A: validations are homogeneous
97 if(nlevels(as.factor(Vtemp$IdV))==1)
98 {
99 IdC2=rbind(IdC2,IdSp)
100 IdExtrap=c(IdExtrap,rep(Vtemp$IdV[1],nrow(IdSp)))
101 TypeE=c(TypeE,rep(2,nrow(IdSp)))
102 }else{
103 #case 2B: validations are heterogeneous
104 #case 2B1: some validations confirms the species identified by Tadarida and highest confidence are confirmed
105 subVT=subset(Vtemp,Vtemp$IdV==levels(as.factor(IdCorrect$ProbEsp_C2bs))[j])
106 subVF=subset(Vtemp,Vtemp$IdV!=levels(as.factor(IdCorrect$ProbEsp_C2bs))[j])
107 if((nrow(subVT)>0)&(max(subVT$IdProb)>max(subVF$IdProb)))
108 {
109 Vtemp=Vtemp[order(Vtemp$IdProb),]
110 test=(Vtemp$IdV!=Vtemp$ProbEsp_C2bs)
111 Fr1=max(which(test == TRUE)) #find the error with highest indices
112 Thr1=mean(Vtemp$IdProb[(Fr1):(Fr1+1)]) #define first threshold as the median confidence between the first error and the confirmed ID right over it
113 #id over this threshold are considered right
114 IdHC=subset(IdSp,IdSp$IdProb>Thr1)
115 IdC2=rbind(IdC2,IdHC)
116 IdExtrap=c(IdExtrap,rep(Vtemp$IdV[nrow(Vtemp)],nrow(IdHC)))
117 TypeE=c(TypeE,rep(2,nrow(IdHC)))
118 #id under this threshold are attributed to validated id closest in time
119 Vtemp=Vtemp[order(Vtemp$TimeNum),]
120 cuts <- c(-Inf, Vtemp$TimeNum[-1]-diff(Vtemp$TimeNum)/2, Inf)
121 CorrV=findInterval(IdSp$TimeNum, cuts)
122 IdE=Vtemp$IdV[CorrV]
123 IdEL=subset(IdE,IdSp$IdProb<=Thr1)
124 IdLC=subset(IdSp,IdSp$IdProb<=Thr1)
125 IdExtrap=c(IdExtrap,IdEL)
126 TypeE=c(TypeE,rep(1,length(IdEL)))
127 IdC2=rbind(IdC2,IdLC)
128
129
130 }else{
131 #case 2B2: all validations concerns errors
132 #id are extrapolated on time only
133 Vtemp=Vtemp[order(Vtemp$TimeNum),]
134 cuts <- c(-Inf, Vtemp$TimeNum[-1]-diff(Vtemp$TimeNum)/2, Inf)
135 CorrV=findInterval(IdSp$TimeNum, cuts)
136 IdE=Vtemp$IdV[CorrV]
137 IdExtrap=c(IdExtrap,IdE)
138 TypeE=c(TypeE,rep(1,length(IdE)))
139 IdC2=rbind(IdC2,IdSp)
140 }
141 }
142
143
144 }
145
146
147 }
148 test1=(nrow(IdC2)==length(IdExtrap))
149 test2=(nrow(IdC2)==nrow(IdCorrect))
150 if((test1==F)|(test2==F))
151 {
152 (stop("Erreur de traitement !!!"))
153 }
154
155 IdC2$IdExtrap=IdExtrap
156 IdC2$TypeE=TypeE
157
158
159 IdC2=IdC2[order(IdC2$IdProb,decreasing=T),]
160 IdC2=IdC2[order(IdC2$ConfV,decreasing=T),]
161 IdC2=IdC2[order(IdC2$`nom du fichier`),]
162 #discard duplicated species within the same files (= false positives corrected by 2nd layer)
163 IdC2=unique(IdC2,by=c("nom du fichier","IdExtrap"))
164
165 write.table(IdC2,"output.tabular",row.names=F,sep="\t",quote=FALSE,na="NA")