diff IdValidTidy.R @ 1:eb19a5089b56 draft default tip

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tools/vigiechiro commit 7ef0e58cbcbf41088e359f00b6c86504c773c271
author ecology
date Fri, 26 Apr 2019 12:21:27 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/IdValidTidy.R	Fri Apr 26 12:21:27 2019 -0400
@@ -0,0 +1,170 @@
+#!/usr/bin/env Rscript
+
+# Command-line arguments given after the script name; args[1] is passed to
+# fread() further down as the path of the input table.
+args <- commandArgs(trailingOnly = TRUE)
+
+#print(args)
+
+library(data.table)
+library(methods)
+
+
+# Hierarchy rule used to write the validator id over the observer id.
+#
+# x: observer value (scalar), returned when the validator gave nothing.
+# y: validator value (scalar); "" or NA both mean "no validation".
+# Returns y when it is filled in, otherwise x.
+ValidHier <- function(x, y) {
+  # is.na() is tested first (and short-circuits) so that a missing validator
+  # value falls back to x instead of making if() fail on an NA condition.
+  if (is.na(y) || y == "") {
+    x
+  } else {
+    y
+  }
+}
+
+# Get date-time data from recording file names.
+#
+# x: character vector of file names; only their last 19 characters are read,
+#    laid out as "YYYYmmdd_HHMMSS_mmm" (date, time, milliseconds).
+# Returns the parsed date-times (read as UTC) minus 7200 seconds.
+# NOTE(review): the 7200 s shift looks like a fixed 2 h clock offset - confirm.
+f2p <- function(x)
+{
+  # digits.secs is set as in the original code, but the caller's value is now
+  # restored on exit so the function no longer changes global options.
+  op <- options(digits.secs = 3)
+  on.exit(options(op), add = TRUE)
+  n <- nchar(x)
+  # "YYYYmmdd_HHMMSS" + "." + "mmm" so that %OS reads the milliseconds
+  pretemps <- paste0(substr(x, n - 18, n - 4), ".", substr(x, n - 2, n))
+  strptime(pretemps, "%Y%m%d_%H%M%OS", tz = "UTC") - 7200
+}
+
+
+# Read the identification table given as first command-line argument
+IdCorrect <- fread(args[1])
+
+# Step 0: compute id score from 2nd layer
+IdCorrect$IdProb <- IdCorrect$tadarida_probabilite
+
+# Missing observer/validator entries are turned into "" so that ValidHier
+# can compare them with ""
+BlankNA <- function(v) {
+  v[is.na(v)] <- ""
+  v
+}
+IdCorrect$observateur_taxon <- BlankNA(IdCorrect$observateur_taxon)
+IdCorrect$observateur_probabilite <- BlankNA(IdCorrect$observateur_probabilite)
+IdCorrect$validateur_taxon <- BlankNA(IdCorrect$validateur_taxon)
+IdCorrect$validateur_probabilite <- BlankNA(IdCorrect$validateur_probabilite)
+
+
+
+# Step 1: compute id and confidence following the hierarchy
+# (validator > observer), row by row
+IdCorrect$IdV <- mapply(
+  ValidHier,
+  IdCorrect$observateur_taxon,
+  IdCorrect$validateur_taxon
+)
+IdCorrect$ConfV <- mapply(
+  ValidHier,
+  IdCorrect$observateur_probabilite,
+  IdCorrect$validateur_probabilite
+)
+
+
+#print(paste(length(subset(IdCorrect$ConfV,IdCorrect$ConfV!=""))))
+
+# Step 2: get numeric time data (TimeNum) and a session id (Session)
+# The 2nd character of the first file name tells the two protocols apart
+RecType <- substr(IdCorrect$`nom du fichier`[1], 2, 2)
+if (RecType == "i") {
+  # car/walk transects: the name is cut on "-", then its 5th field on "_"
+  FileInfo <- as.data.table(tstrsplit(IdCorrect$`nom du fichier`, "-"))
+  IdCorrect$Session <- as.numeric(substr(FileInfo$V4, 5, nchar(FileInfo$V4)))
+  TimeSec <- as.data.frame(as.data.table(tstrsplit(FileInfo$V5, "_")))
+  Seconds <- TimeSec[, (ncol(TimeSec) - 1)]
+  if (sum(Seconds != "00000") == 0) {
+    # to deal with double Kaleidoscope treatments: every seconds field is
+    # "00000"
+    print("NOMS DE FICHIERS NON CONFORMES")
+    print("Vous les avez probablement traiter 2 fois par Kaleidoscope")
+    stop("Merci de nous signaler cette erreur par mail pour correction")
+  } else {
+    # session number * 800 + seconds + milliseconds / 1000
+    Millis <- TimeSec[, (ncol(TimeSec))]
+    IdCorrect$TimeNum <- (IdCorrect$Session * 800
+                          + as.numeric(Seconds)
+                          + as.numeric(Millis) / 1000)
+  }
+} else if (RecType == "a") {
+  # stationary recordings: time comes from the date-time part of the name
+  DateRec <- as.POSIXlt(f2p(IdCorrect$`nom du fichier`))
+  # recordings made before 12:00 are moved back 12 h, i.e. to the date of
+  # the previous day
+  BeforeNoon <- DateRec$hour < 12
+  Nuit <- format(as.Date(DateRec - 43200 * BeforeNoon), format = "%d/%m/%Y")
+  #Nuit[is.na(Nuit)]=0
+  IdCorrect$Session <- Nuit
+  IdCorrect$TimeNum <- as.numeric(DateRec)
+} else {
+  print("NOMS DE FICHIERS NON CONFORMES")
+  stop("Ils doivent commencer par Cir (routier/pedestre) ou par Car (points fixes")
+}
+
+#hist(IdCorrect$TimeNum)
+
+
+
+
+# Step 3: treat sequentially each species identified by Tadarida-C
+# For every Tadarida taxon, the validated ids (IdV) found among its rows are
+# extrapolated to all the rows of that taxon. Results:
+#   IdC2     - the rows of IdCorrect, grouped taxon by taxon
+#   IdExtrap - id extrapolated from validations, one value per row of IdC2
+#   TypeE    - type of extrapolation made, one value per row of IdC2:
+#              0 = no validation for the taxon, Tadarida id kept
+#              1 = id of the validation closest in time
+#              2 = id taken from the validations without using time
+
+# Id (IdV) of the validation closest in time to each value of `times`.
+# `valid` holds the validated rows; its TimeNum and IdV columns are read.
+NearestValidId <- function(times, valid) {
+  valid <- valid[order(valid$TimeNum), ]
+  # interval limits are the mid-points between consecutive validations
+  cuts <- c(-Inf, valid$TimeNum[-1] - diff(valid$TimeNum) / 2, Inf)
+  valid$IdV[findInterval(times, cuts)]
+}
+
+# Taxon list computed once; per-taxon results are stored in pre-allocated
+# lists and bound together after the loop instead of growing at each step.
+TaxaTadarida <- levels(as.factor(IdCorrect$tadarida_taxon))
+RowsBySp <- vector("list", length(TaxaTadarida))
+ExtrapBySp <- vector("list", length(TaxaTadarida))
+TypeBySp <- vector("list", length(TaxaTadarida))
+for (j in seq_along(TaxaTadarida)) {
+  IdSp <- subset(IdCorrect, IdCorrect$tadarida_taxon == TaxaTadarida[j])
+  Vtemp <- subset(IdSp, IdSp$IdV != "") # validated rows of this taxon
+  if (nrow(Vtemp) == 0) {
+    # case 1: no validation, no change
+    Rows <- IdSp
+    Extrap <- rep(IdSp$tadarida_taxon[1], nrow(IdSp))
+    Type <- rep(0, nrow(IdSp))
+  } else if (nlevels(as.factor(Vtemp$IdV)) == 1) {
+    # case 2A: validations are homogeneous, their id is given to every row
+    Rows <- IdSp
+    Extrap <- rep(Vtemp$IdV[1], nrow(IdSp))
+    Type <- rep(2, nrow(IdSp))
+  } else {
+    # case 2B: validations are heterogeneous
+    subVT <- subset(Vtemp, Vtemp$IdV == TaxaTadarida[j]) # confirmed ids
+    subVF <- subset(Vtemp, Vtemp$IdV != TaxaTadarida[j]) # corrected ids
+    # && short-circuits, so max() is never called on an empty subVT
+    if ((nrow(subVT) > 0) && (max(subVT$IdProb) > max(subVF$IdProb))) {
+      # case 2B1: some validations confirm the species identified by
+      # Tadarida and the highest confidences are confirmed
+      VByProb <- Vtemp[order(Vtemp$IdProb), ]
+      # error with the highest index, i.e. with the highest confidence
+      Fr1 <- max(which(VByProb$IdV != VByProb$tadarida_taxon))
+      # threshold half-way between that error and the confirmed id right
+      # over it
+      Thr1 <- mean(VByProb$IdProb[Fr1:(Fr1 + 1)])
+      # ids over this threshold are considered right
+      IdHC <- subset(IdSp, IdSp$IdProb > Thr1)
+      # ids under this threshold get the validated id closest in time
+      IdE <- NearestValidId(IdSp$TimeNum, Vtemp)
+      IdEL <- subset(IdE, IdSp$IdProb <= Thr1)
+      IdLC <- subset(IdSp, IdSp$IdProb <= Thr1)
+      Rows <- rbind(IdHC, IdLC)
+      Extrap <- c(rep(VByProb$IdV[nrow(VByProb)], nrow(IdHC)), IdEL)
+      Type <- c(rep(2, nrow(IdHC)), rep(1, length(IdEL)))
+    } else {
+      # case 2B2: all validations concern errors
+      # ids are extrapolated on time only
+      IdE <- NearestValidId(IdSp$TimeNum, Vtemp)
+      Rows <- IdSp
+      Extrap <- IdE
+      Type <- rep(1, length(IdE))
+    }
+  }
+  RowsBySp[[j]] <- Rows
+  ExtrapBySp[[j]] <- Extrap
+  TypeBySp[[j]] <- Type
+
+  #print(paste(j,nrow(IdC2),length(IdExtrap)))
+
+}
+# The empty IdCorrect[0, ] comes first so that IdC2 keeps the columns of
+# IdCorrect even when there is no taxon at all
+IdC2 <- rbindlist(c(list(IdCorrect[0, ]), RowsBySp), use.names = TRUE)
+IdExtrap <- c(vector(), unlist(ExtrapBySp, use.names = FALSE))
+TypeE <- c(vector(), unlist(TypeBySp, use.names = FALSE))
+# Consistency checks: one extrapolated id per row of IdC2, and no row of
+# IdCorrect lost or duplicated while treating the species
+if (nrow(IdC2) != length(IdExtrap) || nrow(IdC2) != nrow(IdCorrect)) {
+  stop("Erreur de traitement !!!")
+}
+
+IdC2$IdExtrap <- IdExtrap
+IdC2$TypeE <- TypeE
+
+
+# Three successive sorts, kept in their original order: the last one (file
+# name) is the main key; the earlier ones (ConfV, then IdProb, both
+# decreasing) only decide between ties, provided each sort leaves ties in
+# their previous order.
+IdC2 <- IdC2[order(IdC2$IdProb, decreasing = TRUE), ]
+IdC2 <- IdC2[order(IdC2$ConfV, decreasing = TRUE), ]
+IdC2 <- IdC2[order(IdC2$`nom du fichier`), ]
+# discard duplicated species within the same files (= false positives
+# corrected by 2nd layer); the first row of each file/IdExtrap pair is kept
+IdC2 <- unique(IdC2, by = c("nom du fichier", "IdExtrap"))
+
+
+
+# Tab-separated output written in the working directory
+write.table(IdC2, "IdValidTidy.tabular", row.names = FALSE, sep = "\t")