comparison IdValidTidy.R @ 1:eb19a5089b56 draft default tip

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tools/vigiechiro commit 7ef0e58cbcbf41088e359f00b6c86504c773c271
author ecology
date Fri, 26 Apr 2019 12:21:27 -0400
parents
children
comparison
equal deleted inserted replaced
0:8c472c4f1bf5 1:eb19a5089b56
1 #!/usr/bin/env Rscript
2
3 args <- commandArgs(trailingOnly = TRUE)
4
5 #print(args)
6
7 library(data.table)
8 library(methods)
9
10
11 ValidHier=function(x,y) #used to write validator id over observer id
12 {
13 if(y==""){x}else{y}
14 }
15
16 f2p <- function(x) #get date-time data from recording file names
17 {
18 if (is(x)[1] == "data.frame") {pretemps <- vector(length = nrow(x))}
19 op <- options(digits.secs = 3)
20 pretemps <- paste(substr(x, nchar(x) - 18, nchar(x)-4), ".", substr(x, nchar(x) - 2, nchar(x)), sep = "")
21 strptime(pretemps, "%Y%m%d_%H%M%OS",tz="UTC")-7200
22 }
23
24
25 IdCorrect=fread(args[1])
26
27 #Step 0 :compute id score from 2nd Layer
28 IdCorrect$IdProb=IdCorrect$tadarida_probabilite
29
30 IdCorrect$observateur_taxon[is.na(IdCorrect$observateur_taxon)]=""
31 IdCorrect$observateur_probabilite[is.na(IdCorrect$observateur_probabilite)]=""
32 IdCorrect$validateur_taxon[is.na(IdCorrect$validateur_taxon)]=""
33 IdCorrect$validateur_probabilite[is.na(IdCorrect$validateur_probabilite)]=""
34
35
36
37 #Step 1 :compute id with confidence regarding a hierarchy (validator > observer)
38 IdCorrect$IdV=mapply(ValidHier,IdCorrect$observateur_taxon,IdCorrect$validateur_taxon)
39 IdCorrect$ConfV=mapply(ValidHier,IdCorrect$observateur_probabilite
40 ,IdCorrect$validateur_probabilite)
41
42
43 #print(paste(length(subset(IdCorrect$ConfV,IdCorrect$ConfV!=""))))
44
45 #Step 2: Get numerictime data
46 if (substr(IdCorrect$`nom du fichier`[1],2,2)=="i") #for car/walk transects
47 {
48 FileInfo=as.data.table(tstrsplit(IdCorrect$`nom du fichier`,"-"))
49 IdCorrect$Session=as.numeric(substr(FileInfo$V4,5,nchar(FileInfo$V4)))
50 TimeSec=as.data.table(tstrsplit(FileInfo$V5,"_"))
51 TimeSec=as.data.frame(TimeSec)
52 if(sum(TimeSec[,(ncol(TimeSec)-1)]!="00000")==0) #to deal with double Kaleidoscope treatments
53 {
54 print("NOMS DE FICHIERS NON CONFORMES")
55 print("Vous les avez probablement traiter 2 fois par Kaleidoscope")
56 stop("Merci de nous signaler cette erreur par mail pour correction")
57 }else{
58 IdCorrect$TimeNum=(IdCorrect$Session*800
59 +as.numeric(TimeSec[,(ncol(TimeSec)-1)])
60 +as.numeric(TimeSec[,(ncol(TimeSec))])/1000)
61 }
62
63 }else{
64 if(substr(IdCorrect$`nom du fichier`[1],2,2)=="a") #for stationary recordings
65 {
66 DateRec=as.POSIXlt(f2p(IdCorrect$`nom du fichier`))
67 Nuit=format(as.Date(DateRec-43200*(DateRec$hour<12)),format="%d/%m/%Y")
68 #Nuit[is.na(Nuit)]=0
69 IdCorrect$Session=Nuit
70 IdCorrect$TimeNum=as.numeric(DateRec)
71
72 }else{
73 print("NOMS DE FICHIERS NON CONFORMES")
74 stop("Ils doivent commencer par Cir (routier/pedestre) ou par Car (points fixes")
75 }
76 }
77
78 #hist(IdCorrect$TimeNum)
79
80
81
82
83 #Step 3 :treat sequentially each species identified by Tadarida-C
84 IdExtrap=vector() #to store the id extrapolated from validations
85 IdC2=IdCorrect[0,] #to store data in the right order
86 TypeE=vector() #to store the type of extrapolation made
87 for (j in 1:nlevels(as.factor(IdCorrect$tadarida_taxon)))
88 {
89 IdSp=subset(IdCorrect
90 ,IdCorrect$tadarida_taxon==levels(as.factor(IdCorrect$tadarida_taxon))[j])
91 if(sum(IdSp$IdV=="")==(nrow(IdSp))) #case 1 : no validation no change
92 {
93 IdC2=rbind(IdC2,IdSp)
94 IdExtrap=c(IdExtrap,rep(IdSp$tadarida_taxon[1],nrow(IdSp)))
95 TypeE=c(TypeE,rep(0,nrow(IdSp)))
96 }else{ #case 2: some validation
97 Vtemp=subset(IdSp,IdSp$IdV!="")
98 #case2A: validations are homogeneous
99 if(nlevels(as.factor(Vtemp$IdV))==1)
100 {
101 IdC2=rbind(IdC2,IdSp)
102 IdExtrap=c(IdExtrap,rep(Vtemp$IdV[1],nrow(IdSp)))
103 TypeE=c(TypeE,rep(2,nrow(IdSp)))
104 }else{
105 #case 2B: validations are heterogeneous
106 #case 2B1: some validations confirms the species identified by Tadarida and highest confidence are confirmed
107 subVT=subset(Vtemp,Vtemp$IdV==levels(as.factor(IdCorrect$tadarida_taxon))[j])
108 subVF=subset(Vtemp,Vtemp$IdV!=levels(as.factor(IdCorrect$tadarida_taxon))[j])
109 if((nrow(subVT)>0)&(max(subVT$IdProb)>max(subVF$IdProb)))
110 {
111 Vtemp=Vtemp[order(Vtemp$IdProb),]
112 test=(Vtemp$IdV!=Vtemp$tadarida_taxon)
113 Fr1=max(which(test == TRUE)) #find the error with highest indices
114 Thr1=mean(Vtemp$IdProb[(Fr1):(Fr1+1)]) #define first threshold as the median confidence between the first error and the confirmed ID right over it
115 #id over this threshold are considered right
116 IdHC=subset(IdSp,IdSp$IdProb>Thr1)
117 IdC2=rbind(IdC2,IdHC)
118 IdExtrap=c(IdExtrap,rep(Vtemp$IdV[nrow(Vtemp)],nrow(IdHC)))
119 TypeE=c(TypeE,rep(2,nrow(IdHC)))
120 #id under this threshold are attributed to validated id closest in time
121 Vtemp=Vtemp[order(Vtemp$TimeNum),]
122 cuts <- c(-Inf, Vtemp$TimeNum[-1]-diff(Vtemp$TimeNum)/2, Inf)
123 CorrV=findInterval(IdSp$TimeNum, cuts)
124 IdE=Vtemp$IdV[CorrV]
125 IdEL=subset(IdE,IdSp$IdProb<=Thr1)
126 IdLC=subset(IdSp,IdSp$IdProb<=Thr1)
127 IdExtrap=c(IdExtrap,IdEL)
128 TypeE=c(TypeE,rep(1,length(IdEL)))
129 IdC2=rbind(IdC2,IdLC)
130
131
132 }else{
133 #case 2B2: all validations concerns errors
134 #id are extrapolated on time only
135 Vtemp=Vtemp[order(Vtemp$TimeNum),]
136 cuts <- c(-Inf, Vtemp$TimeNum[-1]-diff(Vtemp$TimeNum)/2, Inf)
137 CorrV=findInterval(IdSp$TimeNum, cuts)
138 IdE=Vtemp$IdV[CorrV]
139 IdExtrap=c(IdExtrap,IdE)
140 TypeE=c(TypeE,rep(1,length(IdE)))
141 IdC2=rbind(IdC2,IdSp)
142 }
143 }
144
145
146 }
147
148 #print(paste(j,nrow(IdC2),length(IdExtrap)))
149
150 }
151 test1=(nrow(IdC2)==length(IdExtrap))
152 test2=(nrow(IdC2)==nrow(IdCorrect))
153 if((test1==F)|(test2==F))
154 {
155 (stop("Erreur de traitement !!!"))
156 }
157
158 IdC2$IdExtrap=IdExtrap
159 IdC2$TypeE=TypeE
160
161
162 IdC2=IdC2[order(IdC2$IdProb,decreasing=T),]
163 IdC2=IdC2[order(IdC2$ConfV,decreasing=T),]
164 IdC2=IdC2[order(IdC2$`nom du fichier`),]
165 #discard duplicated species within the same files (= false positives corrected by 2nd layer)
166 IdC2=unique(IdC2,by=c("nom du fichier","IdExtrap"))
167
168
169
170 write.table(IdC2,"IdValidTidy.tabular",row.names=F,sep="\t")