comparison rmsk_summary_table_multiple.r @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children 62fefa284036
comparison
equal deleted inserted replaced
-1:000000000000 0:a4cd8608ef6b
1 #! /usr/bin/env Rscript
2 # repeat masker summary
3 # analysis of *.out file
4 # input arguments: <rmsk_result.out> <reads.fas.
5 # calculates totoal proportion of matched repetetive families and averagee score and print the table
6 suppressPackageStartupMessages(library(getopt))
7
8 # INPUT ARGUMENTS:
9 opt = getopt(matrix(c(
10 'fasta', 'f', 1, "character",
11 'repeat_masker','r', 1, "character",
12 'output', 'o', 1, "character",
13 'help','h',0,'logical'),ncol=4,byrow=T))
14
15 if (!is.null(opt$help)) {
16 cat("Usage:\n\n -f fasta file \n -r repeat masker output\n -o output files\n")
17 stop()
18 }
19
20 RMfiles=system(paste("ls ",opt$repeat_masker,sep=""),intern=T)
21 fasta=system(paste("ls ",opt$fasta,sep=""),intern=T)
22
23
24
25 getsubdir=function(x){
26 xx=gsub(".*/","",x)
27 sdir=gsub(xx,"",x,fixed=T)
28 sdir
29 }
30
31 fasta.summary=function(reads){
32 suppressPackageStartupMessages(library(Biostrings,quiet=T))
33 if (exists("readDNAStringSet")){
34 seqs=readDNAStringSet(reads)
35 }else{
36 seqs=read.DNAStringSet(reads)
37 }
38
39 N=length(seqs)
40 Ls=nchar(seqs)
41 L=sum(Ls)
42 M=median(Ls)
43 A=mean(Ls)
44 output=c(N,L,M,A)
45 names(output)=c("N","length","median","mean")
46 output
47 }
48
49 repCont=function(repname){ # uses external variable!
50 pos=repname==classfamily
51 RC=unname(sum(lengths[pos])/totalLength)
52 RC
53 }
54 repScore=function(repname){ # uses external variable!
55 pos=repname==classfamily
56 S=unname(mean(score[pos]))
57 S
58 }
59
60 N=length(RMfiles)
61
62 for (i in seq_along(RMfiles)){
63 cat(paste((RMfiles[i]),"\n"))
64
65 sdir=getsubdir(RMfiles[i])
66 seqs.summary=fasta.summary(fasta[i])
67 totalLength=seqs.summary["length"]
68
69 total.counts=data.frame(c("All_Reads_Length","All_Reads_Number"),c(NA,NA),c(NA,NA),c(totalLength,seqs.summary['N']),c(totalLength,seqs.summary['N']),c(NA,NA))
70 colnames(total.counts)=c("class/fammily","class","family","hits","content","Mean_Score")
71
72
73 # get RM data if not empty
74 rmsk_empty=length(grep("There were no repetitive sequences detected",scan(RMfiles[i],what=character(),n=1,sep="\n",quiet=T),fixed=T))>0
75 rmsk_notperformed = length(grep("No Repbase used with RepetMasker",scan(RMfiles[i],what=character(),n=1,sep="\n",quiet=T),fixed=T))>0
76 if (!rmsk_empty & !rmsk_notperformed) {
77 rmsk=read.table(RMfiles[i],header=F,as.is=T,skip=2,comment.char="*",fill=T)
78 score=rmsk[,1]
79 Ids=rmsk[,5]
80 starts=rmsk[,6]
81 ends=rmsk[,7]
82 lengths=ends-starts
83 classfamily=rmsk[,11]
84 out.table=table(classfamily)
85 out.table=data.frame(names(out.table),c(out.table),stringsAsFactors=F)
86 contents=sapply(out.table[,1],repCont)*100
87 meanScore=sapply(out.table[,1],repScore)
88 class=gsub("/.*","",out.table[,1])
89 families=gsub(".*/","",out.table[,1])
90 out.table=data.frame(out.table[,1],class,families,out.table[,-1],contents,meanScore,stringsAsFactors=F)
91 colnames(out.table)=c("class/fammily","class","family","hits","content","Mean_Score")
92 out.table=out.table[order(out.table$content,decreasing=T),]
93 out.table=rbind(out.table,total.counts)
94 }else{
95 out.table=total.counts
96 }
97 out.table=out.table[order(out.table$content,decreasing=T),]
98
99 write.table(out.table,file=paste(sdir,opt$output,sep=""),row.names=F,quote=F,sep="\t")
100
101
102 # merge all tables:
103
104 colnames(out.table)=paste(colnames(out.table),c("",rep(sdir,5)))
105
106 if (i==1) {
107 mergedTable=out.table[,c(1,4,5,6)]
108 }else{
109
110 mergedTable=merge(mergedTable,out.table[,c(1,4,5,6)],by=1,all=T)
111 }
112
113 }
114 mergedTable=mergedTable[order(rowSums(mergedTable[,seq(2,N*3,by=3),drop=FALSE],na.rm=T),decreasing=T),]
115 mergedTable[is.na(mergedTable)]<-0
116
117 write.table(t(mergedTable),file=paste(opt$output,"summary.csv",sep=""),col.names=F,row.names=T,quote=F,sep="\t")
118
119 #save.image("tmp.RData")
120
121
122
123
124
125
126
127