sampleFasta.xml
cluster_table2krona_format.xml extract_files_from_re_archive.xml plot_comparative_clustering_summary.R plot_comparative_clustering_summary.xml
ChipSeqRatioReport README.html fasta_tmp_single tmp.RData tmp/.dummy |
#!/usr/bin/env python3
'''
Take various inputs and convert them to Krona tabular format for visualization.

Supported inputs:
- DANTE gff3
- TODO PROFREP gff3
- TODO RE archive - normal run
- TODO RE archive - comparative
'''
import argparse
import re
import collections
import sys

# Attribute of GFF3 column 9 that carries the classification string.
FINAL_CLASSIFICATION = re.compile(r"Final_Classification=(.*)")


def parse_dante_gff(f):
    '''Load a gff3 file and return the classification with counts.

    f -- iterable of text lines (an open file or a list of strings).

    Comment lines (starting with "#") and blank lines are ignored.  Every
    other line must have at least nine tab separated columns and carry a
    "Final_Classification=" attribute in the ninth one.

    Returns a defaultdict mapping the raw classification string
    (levels separated by "|") to the number of features carrying it.

    Raises ValueError when a feature line is too short or lacks the
    "Final_Classification=" attribute.
    '''
    cls_count = collections.defaultdict(int)
    for lineno, raw_line in enumerate(f, start=1):
        # Strip first: otherwise a classification that is the last attribute
        # on the line would keep its trailing newline inside the key.
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        columns = line.split("\t")
        if len(columns) < 9:
            raise ValueError(
                "line {}: expected 9 tab separated columns, found {}".format(
                    lineno, len(columns)))
        for attribute in columns[8].split(";"):
            found = FINAL_CLASSIFICATION.match(attribute)
            if found:
                cls_count[found.group(1)] += 1
                break
        else:
            raise ValueError(
                "line {}: attribute Final_Classification is missing".format(
                    lineno))

    return cls_count


def export_classification(cls, f):
    '''Save the classification to a tab delimited file.

    cls -- mapping of classification string ("|" separated levels) to count.
    f   -- writable text file object.

    One line is written per classification: the count followed by the
    classification levels, all tab separated.
    '''
    for classification, count in cls.items():
        f.write('{}\t{}\n'.format(count, classification.replace("|", "\t")))


def main():
    '''Command line entry point.'''
    parser = argparse.ArgumentParser(description=__doc__)
    # All three options are needed to do any work, so a missing one is
    # reported by argparse instead of failing later on an undefined name.
    parser.add_argument('-f', '--format', required=True,
                        choices=['dante', 'profrep', 're'])
    parser.add_argument('-i', '--input', required=True,
                        type=argparse.FileType('r'))
    parser.add_argument('-o', '--output', required=True,
                        type=argparse.FileType('w'))

    args = parser.parse_args()

    # The original test list was written as ["profrep" 're'], which Python
    # joins into the single string "profrepre", so this branch never ran.
    if args.format in ["profrep", "re"]:
        # Non-zero exit status so callers do not mistake the empty output
        # for a successful conversion.
        sys.exit("Not implemented")

    with args.input, args.output:
        classification = parse_dante_gff(args.input)
        export_classification(classification, args.output)


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Convert RepeatExplorer2 CLUSTER_table.csv to Krona tabular input.

The table starts with summary lines ("name<TAB>number") followed by a header
line and one line per cluster.  Reads of the listed clusters are summed per
annotation; reads in unlisted clusters and singlets are reported separately.
"""
import sys
import re
from collections import defaultdict
import argparse

HEADER_PATTERN = re.compile(
    '.*Cluster.+Supercluster.+Size.+Size_adjusted.+Automatic_annotation'
    '.+TAREAN_annotation.+Final_annotation')
SIZE_ADJUSTED_COLUMN = 3
AUTOMATIC_ANNOTATION_COLUMN = 4
FINAL_ANNOTATION_COLUMN = 6


def _split_fields(line):
    """Split one table line into its fields.

    Tab separated lines are split on tabs only, because the TAREAN annotation
    (e.g. "Putative satellites (high confidence)") contains spaces and would
    otherwise shift the columns behind it.  Lines without any tab fall back
    to whitespace splitting.
    """
    line = line.rstrip("\r\n")
    if "\t" in line:
        return line.split("\t")
    return line.split()


def summarize_cluster_table(lines, column):
    """Sum read counts per annotation.

    lines  -- iterable of text lines of CLUSTER_table.csv.
    column -- zero-based index of the annotation column to use.

    Returns a dict-like mapping of Krona classification line (levels are tab
    separated, the first level is "Top_clusters") to number of reads, plus
    the keys "Singlets" and "Small_cluster".

    Raises KeyError when the summary lines "Number_of_singlets" or
    "Number_of_reads_in_clusters" are missing from the input.
    """
    header = False
    clust_info = {}
    counts = defaultdict(int)
    top_clusters = 0
    for line in lines:
        if not line.strip():
            # Blank lines carry no data, neither before nor after the header.
            continue
        if HEADER_PATTERN.match(line):
            print("header detected")
            header = True
            continue
        parts = _split_fields(line)
        if header:
            # The first level of the annotation path ("All") is dropped.
            levels = parts[column].split("/")[1:]
            classification = "Top_clusters\t" + "\t".join(levels).replace('"', '')
            size = int(parts[SIZE_ADJUSTED_COLUMN])
            counts[classification] += size
            top_clusters += size
        elif len(parts) >= 2:
            clust_info[parts[0].replace('"', '')] = int(parts[1])

    counts['Singlets'] = clust_info['Number_of_singlets']
    # Reads that are in clusters which are not listed in the table.
    counts['Small_cluster'] = (
        int(clust_info['Number_of_reads_in_clusters']) - top_clusters)
    return counts


def main():
    """Command line entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=argparse.FileType('r'),
                        required=True, help="path to file CLUSTER_table.csv")
    parser.add_argument("-o", "--output", type=argparse.FileType('w'),
                        required=True, help="output file name")
    parser.add_argument("-m", "--use_manual", action='store_true',
                        default=False)
    args = parser.parse_args()

    if args.use_manual:
        column = FINAL_ANNOTATION_COLUMN
    else:
        column = AUTOMATIC_ANNOTATION_COLUMN

    # argparse.FileType already returns open file objects, so they are used
    # directly instead of being passed to open() again.
    with args.input as f:
        counts = summarize_cluster_table(f, column)

    with args.output as fout:
        for cls_line, nreads in counts.items():
            fout.write(str(nreads) + "\t" + cls_line + "\n")


if __name__ == "__main__":
    main()
<tool id="cluster_table2krona_format" name="Convert RepeatExplorer2 CLUSTER_table.csv to Krona formatted input" version="1.0.0" python_template_version="3.5">
    <!-- The converter script is expected next to this file; paths are quoted
         so that dataset locations containing spaces do not split arguments.
         -m switches the script from Automatic_annotation to Final_annotation. -->
    <command detect_errors="exit_code"><![CDATA[
        '$__tool_directory__/cluster_table2krona_format.py' --input '${input}' --output '${output}'
        #if $column == "Final_annotation"
            -m
        #end if
    ]]></command>
    <inputs>
        <param type="data" name="input" format="txt" label="CLUSTER_table.csv" />
        <param name="column" type="select" label="What annotation column do you want to include in the output?">
            <option value="Final_annotation">Final_annotation</option>
            <option value="Automatic_annotation" selected="true">Automatic_annotation</option>
        </param>
    </inputs>
    <outputs>
        <data format="tabular" name="output" label="RepeatExplorer cluster annotation formatted for Krona visualization from data ${input.hid}"/>
    </outputs>
    <help><![CDATA[
This tool converts the CLUSTER_table.csv output of RepeatExplorer2 to a file which can be visualized with Krona. As input use CLUSTER_table.csv obtained from a RepeatExplorer2 analysis.

Example of CLUSTER_table.csv::

    '"Number_of_reads_in_clusters" 3002 '
    '"Number_of_clusters" 895 '
    '"Number_of_superclusters" 895 '
    '"Number_of_singlets" 6998 '
    '"Number_of_analyzed_reads" 10000 '
    '"Cluster" "Supercluster" "Size" "Size_adjusted" "Automatic_annotation" "TAREAN_annotation" "Final_annotation"'
    '1 1 61 61 "All" "Other" ""'
    '2 2 59 59 "All/repeat/satellite" "Putative satellites (high confidence)" ""'
    '3 3 45 45 "All/repeat/satellite" "Putative satellites (low confidence)" ""'
    '4 4 38 38 "All" "Other" ""'
    '5 5 32 32 "All" "Other" ""'
    '6 6 28 28 "All" "Other" ""'
    '7 7 25 25 "All" "Other" ""'
    '8 8 24 24 "All" "Other" ""'
    '9 9 23 23 "All" "Other" ""'
    '10 10 22 22 "All/repeat/mobile_element/Class_I/LTR/Ty3_gypsy/non-chromovirus/OTA/Tat/Ogre" "Other" ""'
    '11 11 20 20 "All" "Other" ""'

The last column, "Final_annotation", is intended to be filled in manually based on curation of the automatic annotation results. In a CLUSTER_table.csv taken directly from RepeatExplorer2 output this column is empty, so only the "Automatic_annotation" column can be converted.
    ]]></help>
</tool>
<tool id="extract_var_files_from_re" name="Extract various files from RepeatExplorer2 archive" version="1.0.0">
    <!-- One unzip call is emitted per selected file. The calls are chained
         with && (closed by "true") so that a failing unzip, e.g. a member
         missing from the archive, sets the exit code checked by
         detect_errors instead of being hidden by the commands after it. -->
    <command detect_errors="exit_code"><![CDATA[
        #for $sf in $file:

            #if $sf == "CLUSTER_TABLE.csv"
                unzip -p -j '${RepeatExplorer_archive}' '${sf}' > '${cluster_table}'
                &&
            #end if

            #if $sf == "COMPARATIVE_ANALYSIS_COUNTS.csv"
                unzip -p -j '${RepeatExplorer_archive}' '${sf}' > '${comparative_analysis_count}'
                &&
            #end if

            #if $sf == "SUPERCLUSTER_TABLE.csv"
                unzip -p -j '${RepeatExplorer_archive}' '${sf}' > '${supercluster_table}'
                &&
            #end if

        #end for
        true
    ]]></command>

    <inputs>
        <param name="RepeatExplorer_archive" label="Archive with RepeatExplorer2 results" type="data" format="zip"/>

        <param name="file" type="select" label="Select files you want to extract" multiple="true" optional="false">
            <option value="CLUSTER_TABLE.csv">CLUSTER_TABLE.csv</option>
            <option value="COMPARATIVE_ANALYSIS_COUNTS.csv">COMPARATIVE_ANALYSIS_COUNTS.csv</option>
            <option value="SUPERCLUSTER_TABLE.csv">SUPERCLUSTER_TABLE.csv</option>
        </param>
    </inputs>

    <!-- Each output is created only when the matching file was selected. -->
    <outputs>
        <data format="tabular" name="cluster_table" label="CLUSTER_TABLE.csv from ${RepeatExplorer_archive.hid}">
            <filter>"CLUSTER_TABLE.csv" in file</filter>
        </data>
        <data format="tabular" name="supercluster_table" label="SUPERCLUSTER_TABLE.csv from ${RepeatExplorer_archive.hid}">
            <filter>"SUPERCLUSTER_TABLE.csv" in file</filter>
        </data>
        <data format="tabular" name="comparative_analysis_count" label="COMPARATIVE_ANALYSIS_COUNTS.csv from ${RepeatExplorer_archive.hid}">
            <filter>"COMPARATIVE_ANALYSIS_COUNTS.csv" in file</filter>
        </data>
    </outputs>
</tool>
b'@@ -0,0 +1,304 @@\n+#!/usr/bin/env Rscript\n+library(optparse)\n+## TODO - add scale to legend!\n+twenty_colors = c(\n+ \'#e6194b\', \'#3cb44b\', \'#ffe119\', \'#4363d8\', \'#f58231\',\n+ \'#911eb4\', \'#46f0f0\', \'#f032e6\', \'#bcf60c\', \'#fabebe\',\n+ \'#008080\', \'#e6beff\', \'#9a6324\', \'#fffac8\', \'#800000\',\n+ \'#aaffc3\', \'#808000\', \'#ffd8b1\', \'#000075\', "#000000"\n+)\n+\n+get_color = function(classification, size){\n+ ## 20 of unique colors, first is black\n+ unique_colors = twenty_colors[1:opt$number_of_colors]\n+ Ncol = length(unique_colors)\n+ ## rest wil be grey:\n+ grey_color = "#a9a9a9"\n+ ## unique repeats without All\n+ include = !classification %in% "All"\n+ unique_repeats = names(c(sort(by(size[include], INDICES = classification[include], FUN = sum), decreasing = TRUE)))\n+ color_table = unique_colors[1:min(Ncol,length(unique_repeats))]\n+ names(color_table) = unique_repeats[1:min(Ncol,length(unique_repeats))]\n+ color = rep(grey_color, length(classification))\n+ names(color) = classification\n+ for (ac in names(color_table)){\n+ color[names(color) %in% ac] = color_table[ac]\n+ }\n+ return(color)\n+}\n+\n+\n+make_legend = function(color){\n+ ## simplify description:\n+ names(color) = gsub(".+/","",names(color))\n+ description = sapply(split(names(color), color), function(x) paste(unique(x), collapse=";"))\n+ description = gsub(".+;.+", "Other", description)\n+ description = gsub("All", "Other", description)\n+ if ("Other" %in% description & length(description) > 1){\n+ description = c(description[! 
description %in% "Other"], description[description %in% "Other"])\n+ }\n+ ord = order(factor(names(description), levels = twenty_colors))\n+ legend_info = list(name = gsub("All", "NA", description)[ord], color = names(description)[ord])\n+}\n+\n+plot_rect_map = function(read_counts,cluster_annotation, output_file,GS, RL, Xcoef=1,Ycoef=1){\n+ ## read_counts : correspond to COMPARATIVE_ANALYSIS_COUNTS.csv\n+ ## cluster annotation : CLUSTER_TABLE.csv\n+ counts = read.table(read_counts,header=TRUE,\n+ input_read_counts = unlist(read.table(read_counts, nrows = 1, comment.char = "",sep="\\t")[-(1:2)])\n+\n+ counts_file_valid = ncol(counts) == (length(input_read_counts) + 2) & all(colnames(input_read_counts)[1:2]==c("cluster", "supercluster"))\n+ ## find which line is header\n+ header_line = grep(".*Cluster.*Supercluster.*Size", readLines(cluster_annotation))\n+ annot = read.table(cluster_annotation, sep="\\t",header=TRUE,, skip = header_line - 1)\n+ ## validate\n+ annot_file_valid = all(colnames(annot)==c("Cluster","Supercluster","Size","Size_adjusted","Automatic_annotation","TAREAN_annotation","Final_annotation"))\n+\n+\n+ if (!annot_file_valid | !counts_file_valid){\n+ pdf(output_file)\n+\n+ text(0.5,0.5,"Input is not valid, check input files!")\n+\n+ stop("Input files are not valid!")\n+ }\n+ print(annot_file_valid)\n+ print(counts_file_valid)\n+ ## remove counts which are not in annotation - only clusters in annot will be plotted!\n+ counts = counts[annot$Cluster,]\n+ N = nrow(annot)\n+\n+ counts_automatic = counts\n+ annot_automatic = annot\n+ input_read_counts_automatic = input_read_counts\n+ # remove organelar and contamination if required make count correction\n+ if (opt$nuclear_only){\n+ exclude=grep("contamination|organelle",annot$Automatic_annotation)\n+ if (length(exclude)>0){\n+ counts_automatic = counts[-exclude, , drop=FALSE]\n+ annot_automatic = annot[-exclude, ,drop=FALSE]\n+ input_read_counts_automatic = input_read_counts - 
colSums(counts[exclude,-c(1:2) , drop=FALSE])\n+ }\n+ }\n+ color_auto = get_color(annot_automatic$Automatic_annotation, annot_automatic$Size)\n+\n+ legend_info = make_legend(color_auto)\n+ params = list(Automatic_annotation = list(\n+ color = color_auto,\n+ legend = legend_info,\n+ counts = counts_automatic,\n+ annot = annot_automatic,\n+ input_read_counts = input_read_counts_'..b'seq(0,1, length.out = nrow(Mn3))\n+ rectMap(Mn3scale,\'none\',col="grey", grid=FALSE, boxlab="", draw_box=FALSE, center=FALSE)\n+ slabels = pretty(c(0,MaxGS), n = 10)\n+ sat = slabels/MaxGS * nrow(Mn3scale)\n+ axis(side=1, at= sat, labels = slabels, line = 0)\n+ mtext(side = 1, text = "Repeat abundance", las=1, line=2.5,cex=0.4)\n+ mtext(side = 2, text = "Rectangle\\n height", las=1, line=2,cex=0.4, at=1)\n+\n+ axis(2, at=c(0.5, 1, 1.5), labels=c(0,0.5,1),line=0)\n+ }\n+ st =\n+}\n+\n+rectMap=function(x,\'row\',col=1,xlab="",ylab="",grid=TRUE,axis_pos=c(1,4),boxlab = "Cluster Id", cexx=NULL,cexy=NULL, draw_box=TRUE, center=TRUE){\n+ if (\'row\'){\n+ #x=(x)/rowSums(x)\n+ x=(x)/apply(x,1,sum)\n+ }\n+ if (\'column\'){\n+ x=t(t(x)/apply(x,2,max))\n+ }\n+ nc=ncol(x)\n+ nr=nrow(x)\n+ coords=expand.grid(1:nr,1:nc)\n+ plot(coords[,1],coords[,2],type=\'n\',axes=F,xlim=range(coords[,1])+c(-.5,.5),ylim=range(coords[,2])+c(-.5,.5),xlab=xlab,ylab=ylab)\n+ axis(axis_pos[1],at=1:nr,labels=rownames(x),lty=0,tick=FALSE,line=0,cex.axis=0.5/log10(nr))\n+ axis(axis_pos[2],at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=0, cex.axis=0.7)\n+ axis(2,at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=1, cex.axis=0.7)\n+\n+ mtext(side = 1, boxlab, las=1, line = 3, cex = 0.5)\n+ line = 1.5 + log10(nr)\n+ #mtext(side = 2, "Proportions of individual samples", las =0, line = line, cex = 0.5)\n+ s=x/2\n+ w = c(x)/2\n+ if(center){\n+ rect(coords[,1]-0.5,coords[,2]-s,coords[,1]+0.5,coords[,2]+s,col=col,border=NA)\n+ }else{\n+ 
rect(coords[,1]-0.5,coords[,2]-0.5,coords[,1]+0.5,coords[,2]+x-0.5,col=col,border=NA)\n+ }\n+ if (grid){\n+ abline(v=0:(nr)+.5,h=0:(nc)+.5,lty=2,col="#60606030",lwd=0.2)\n+ }\n+ if(draw_box){\n+ box(col="#60606030",lty=2, lwd=0.2)\n+ }\n+}\n+\n+ option_list <- list( \n+ make_option(c("-c", "--cluster_table"), default=NA, type = "character",\n+ help="file from RepeatExplorer2 clustering - CLUSTER_TABLE.csv"),\n+\n+ make_option(c("-m", "--comparative_counts"),default = NA,type = "character",\n+ help="file from RepeatExplorer2 output - COMPARATIVE_ANALYSIS_COUNTS.csv"),\n+\n+ make_option(c("-o", "--output"), type="character",\n+ default="comparative_analysis_summary.pdf",\n+ help="File name for output figures (pdf document)"),\n+ make_option(c("-N", "--number_of_colors"), type="integer", default=10,\n+ help="Number of unique colors used from plotting (2-20, default is 10)"),\n+\n+ make_option(c("-g", "--genome_size"),default = NA,type = "character",\n+ help="file from genome sizes of species provided in tab delimited file in the format:\n+\n+ species_code1 GenomeSize1\n+ species_code2 GenomeSize2\n+ species_code3 GenomeSize3\n+ species_code4 GenomeSize4\n+\n+ provide the same codes for species as in file COMPARATIVE_ANALYSIS_COUNTS.csv. The use of genome\n+ sizes file imply the --nuclear_only option. If genome sizes are used, genomic abundance scale is added.\n+ "),\n+ make_option(c("-n", "--nuclear_only"), default = FALSE, type="logical",\n+ action = "store_true",\n+ help="remove all non-nuclear sequences (organelle and contamination). 
")\n+)\n+\n+\n+opt = parse_args(OptionParser(option_list = option_list))\n+\n+if (any($cluster_table, opt$comparative_counts)))){\n+ message("\\nBoth files: CLUSTER_TABLE.csv and COMPARATIVE_ANALYSIS_COUNTS.csv must be provided\\n")\n+ q()\n+}\n+\n+if (!opt$number_of_colors %in% 1:20){\n+ message("number of color must be in range 1..20")\n+ stop()\n+}\n+\n+if (!$genome_size)){\n+ GS = read.table(opt$genome_size, header=FALSE,, row.names = 1)\n+ opt$nuclear_only=TRUE\n+}else{\n+ GS = NA\n+ RL = NA\n+}\n+\n+plot_rect_map(opt$comparative_counts, opt$cluster_table, opt$output, GS, RL)\n+\n' |
<tool id="plot_comparative" name="Visualization of comparative clustering" version="1.0.0">
    <description>Simple utility to create visualization of RepeatExplorer comparative analysis</description>
    <requirements>
        <requirement type="package">r-optparse</requirement>
    </requirements>

    <!-- Rscript is called explicitly instead of through the deprecated
         "interpreter" attribute; paths are quoted to survive spaces. -->
    <command detect_errors="exit_code"><![CDATA[
        Rscript '$__tool_directory__/plot_comparative_clustering_summary.R'
        --cluster_table='$cluster_table'
        --comparative_counts='$counts'
        --number_of_colors=$number_of_colors
        --output='$outpdf'
        $nuclear_only

        #if $normalization.use_genome_size:
            --genome_size '$normalization.genome_size_table'
        #end if
    ]]></command>

    <inputs>
        <param format="txt" type="data" name="cluster_table" label="file from RepeatExplorer2 clustering - CLUSTER_TABLE.csv"/>
        <param format="txt" type="data" name="counts" label="file from RepeatExplorer2 output - COMPARATIVE_ANALYSIS_COUNTS.csv"/>
        <param value="10" min="2" max="20" type="integer" name="number_of_colors" label="Maximum number of colors used for plotting"/>
        <param checked="false" type="boolean" truevalue="--nuclear_only" falsevalue="" name="nuclear_only" label="Remove all non-nuclear sequences (organelle and contamination)"/>
        <conditional name="normalization">
            <param name="use_genome_size" type="boolean" checked="false" label="Normalize to genome size" help="Note that if this option is used, non-nuclear sequences are always removed."/>
            <when value="false">
                <!-- pass -->
            </when>
            <when value="true">
                <param name="genome_size_table" type="data" format="txt" label="table with genome sizes"/>
            </when>
        </conditional>
    </inputs>

    <outputs>
        <data format="pdf" name="outpdf" label="Comparative analysis summary"/>
    </outputs>
    <help><![CDATA[
**Visualization of comparative clustering**

The visualization is created from two output files of the RepeatExplorer pipeline: CLUSTER_TABLE.csv and COMPARATIVE_ANALYSIS_COUNTS.csv.

Input file CLUSTER_TABLE.csv contains the automatic annotation, information about cluster sizes and the total number of reads used for the analysis.

Example of CLUSTER_TABLE.csv::

    "Number_of_reads_in_clusters" 3002
    "Number_of_clusters" 895
    "Number_of_superclusters" 895
    "Number_of_singlets" 6998

    "Number_of_analyzed_reads" 10000

    "Cluster" "Supercluster" "Size" "Size_adjusted" "Automatic_annotation" "TAREAN_annotation" "Final_annotation"
    1 1 61 61 "All" "Other"
    2 2 59 59 "All/repeat/satellite" "Putative satellites (high confidence)"
    3 3 45 45 "All/repeat/satellite" "Putative satellites (low confidence)"
    4 4 38 38 "All" "Other"
    5 5 32 32 "All" "Other"
    6 6 28 28 "All" "Other"
    7 7 25 25 "All" "Other"
    8 8 24 24 "All" "Other"
    9 9 23 23 "All" "Other"
    10 10 22 22 "All/repeat/mobile_element/Class_I/LTR/Ty3_gypsy/non-chromovirus/OTA/Tat/Ogre" "Other"
    11 11 20 20 "All" "Other"
    12 12 20 20 "All" "Other"
    ]]></help>
</tool>
@@ -1,5 +1,5 @@ <tool id="sampler" name="Read sampling" version="1.0.1"> - <description> Tool for random sampling subsets of reads from larger dataset</description> + <description> Tool for randomly sampling subsets of reads from large datasets</description> <requirements> <requirement type="package">seqkit</requirement> </requirements> @@ -40,8 +40,9 @@ <help> **What it does** - This tools is intended to create sample of sequences from by taking 'random' sample from larger data sets. - Using a same seed parameter make sampling reproducible. + This tool randomly samples the specified number of reads from larger datasets. + Using the same random number generator seed with the same dataset results in sampling the same set of reads, while + using different seeds generates different subsets of reads. </help> |
