annotate rmsk_summary_table_multiple.r @ 34:91996b991991 draft default tip

Uploaded
author petr-novak
date Fri, 16 Feb 2024 15:22:21 +0000
parents 62fefa284036
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
1 #! /usr/bin/env Rscript
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
2 # repeat masker summary
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
3 # analysis of *.out file
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
4 # input arguments: <rmsk_result.out> <reads.fas.
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
5 # calculates totoal proportion of matched repetetive families and averagee score and print the table
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
6
14
62fefa284036 Uploaded
petr-novak
parents: 0
diff changeset
7 opt = list()
62fefa284036 Uploaded
petr-novak
parents: 0
diff changeset
8 opt$fasta = commandArgs(T)[1]
62fefa284036 Uploaded
petr-novak
parents: 0
diff changeset
9 opt$repeat_masker = commandArgs(T)[2]
62fefa284036 Uploaded
petr-novak
parents: 0
diff changeset
10 opt$output = commandArgs(T)[3]
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
11
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
12
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
13 RMfiles=system(paste("ls ",opt$repeat_masker,sep=""),intern=T)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
14 fasta=system(paste("ls ",opt$fasta,sep=""),intern=T)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
15
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
16
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
17
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
18 getsubdir=function(x){
14
62fefa284036 Uploaded
petr-novak
parents: 0
diff changeset
19 xx=gsub(".*/","",x)
62fefa284036 Uploaded
petr-novak
parents: 0
diff changeset
20 sdir=gsub(xx,"",x,fixed=T)
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
21 sdir
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
22 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
23
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
24 fasta.summary=function(reads){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
25 suppressPackageStartupMessages(library(Biostrings,quiet=T))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
26 if (exists("readDNAStringSet")){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
27 seqs=readDNAStringSet(reads)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
28 }else{
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
29 seqs=read.DNAStringSet(reads)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
30 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
31
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
32 N=length(seqs)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
33 Ls=nchar(seqs)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
34 L=sum(Ls)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
35 M=median(Ls)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
36 A=mean(Ls)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
37 output=c(N,L,M,A)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
38 names(output)=c("N","length","median","mean")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
39 output
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
40 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
41
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
42 repCont=function(repname){ # uses external variable!
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
43 pos=repname==classfamily
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
44 RC=unname(sum(lengths[pos])/totalLength)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
45 RC
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
46 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
47 repScore=function(repname){ # uses external variable!
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
48 pos=repname==classfamily
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
49 S=unname(mean(score[pos]))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
50 S
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
51 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
52
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
53 N=length(RMfiles)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
54
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
55 for (i in seq_along(RMfiles)){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
56 cat(paste((RMfiles[i]),"\n"))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
57
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
58 sdir=getsubdir(RMfiles[i])
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
59 seqs.summary=fasta.summary(fasta[i])
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
60 totalLength=seqs.summary["length"]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
61
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
62 total.counts=data.frame(c("All_Reads_Length","All_Reads_Number"),c(NA,NA),c(NA,NA),c(totalLength,seqs.summary['N']),c(totalLength,seqs.summary['N']),c(NA,NA))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
63 colnames(total.counts)=c("class/fammily","class","family","hits","content","Mean_Score")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
64
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
65
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
66 # get RM data if not empty
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
67 rmsk_empty=length(grep("There were no repetitive sequences detected",scan(RMfiles[i],what=character(),n=1,sep="\n",quiet=T),fixed=T))>0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
68 rmsk_notperformed = length(grep("No Repbase used with RepetMasker",scan(RMfiles[i],what=character(),n=1,sep="\n",quiet=T),fixed=T))>0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
69 if (!rmsk_empty & !rmsk_notperformed) {
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
70 rmsk=read.table(RMfiles[i],header=F,as.is=T,skip=2,comment.char="*",fill=T)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
71 score=rmsk[,1]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
72 Ids=rmsk[,5]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
73 starts=rmsk[,6]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
74 ends=rmsk[,7]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
75 lengths=ends-starts
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
76 classfamily=rmsk[,11]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
77 out.table=table(classfamily)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
78 out.table=data.frame(names(out.table),c(out.table),stringsAsFactors=F)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
79 contents=sapply(out.table[,1],repCont)*100
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
80 meanScore=sapply(out.table[,1],repScore)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
81 class=gsub("/.*","",out.table[,1])
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
82 families=gsub(".*/","",out.table[,1])
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
83 out.table=data.frame(out.table[,1],class,families,out.table[,-1],contents,meanScore,stringsAsFactors=F)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
84 colnames(out.table)=c("class/fammily","class","family","hits","content","Mean_Score")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
85 out.table=out.table[order(out.table$content,decreasing=T),]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
86 out.table=rbind(out.table,total.counts)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
87 }else{
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
88 out.table=total.counts
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
89 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
90 out.table=out.table[order(out.table$content,decreasing=T),]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
91
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
92 write.table(out.table,file=paste(sdir,opt$output,sep=""),row.names=F,quote=F,sep="\t")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
93
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
94
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
95 # merge all tables:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
96
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
97 colnames(out.table)=paste(colnames(out.table),c("",rep(sdir,5)))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
98
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
99 if (i==1) {
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
100 mergedTable=out.table[,c(1,4,5,6)]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
101 }else{
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
102
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
103 mergedTable=merge(mergedTable,out.table[,c(1,4,5,6)],by=1,all=T)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
104 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
105
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
106 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
107 mergedTable=mergedTable[order(rowSums(mergedTable[,seq(2,N*3,by=3),drop=FALSE],na.rm=T),decreasing=T),]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
108 mergedTable[is.na(mergedTable)]<-0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
109
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
110 write.table(t(mergedTable),file=paste(opt$output,"summary.csv",sep=""),col.names=F,row.names=T,quote=F,sep="\t")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
111
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
112 #save.image("tmp.RData")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
113
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
114
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
115
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
116
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
117
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
118
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
119
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
120