Mercurial > repos > mvdbeek > damidseq_polii_gene_call
changeset 0:1b5bd3b955ed draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damidseq_polII_gene_call commit 2e97fdaeafdafe2c63588de17777cd32a798adb1-dirty
author | mvdbeek |
---|---|
date | Fri, 20 Apr 2018 10:27:20 -0400 |
parents | |
children | 83ca801f21f9 |
files | damid_polII_gene_call.xml polii.gene.call test-data/details.csv test-data/genes.csv test-data/genes.gff test-data/polII.bed |
diffstat | 6 files changed, 565 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/damid_polII_gene_call.xml Fri Apr 20 10:27:20 2018 -0400 @@ -0,0 +1,50 @@ +<tool id="damidseq_polII_gene_call" name="Calculate polII occupancy" version="1.0.2"> + <description>from DamIDseq data</description> + <requirements> + <requirement type="package" version="3.4">R</requirement> + </requirements> + <version_command><![CDATA[Rscript '$__tool_directory__/polii.gene.call' | grep -o "v.*"]]></version_command> + <command detect_errors="aggressive"><![CDATA[ +ln -fs '$genes' genes.gff && +ln -fs '$input_file' input.$input_file.ext && +Rscript '$__tool_directory__/polii.gene.call' --genes.file=genes.gff input.$input_file.ext --iter=$iter --fdr=$fdr && +mv *.details.csv details.csv && +mv *.genes genes.csv + ]]></command> + <inputs> + <param name="input_file" type="data" format="gff,bed,bedgraph" label="Select dam-fusion/dam ratio files" help="You can use damidseq_core to produce this file."/> + <param name="genes" type="data" format="gff" label="Select a GFF file containing genes for which to calculate occupancy"/> + <param name="iter" type="integer" min="1" value="50000" label="Number of iterations for calculation of FDR"/> + <param name="fdr" type="float" min="0" max="1" value="0.01" label="FDR cutoff for gene list"/> + </inputs> + <outputs> + <data name="details" format="csv" from_work_dir="details.csv" label="${tool.name} gene details on ${on_string}"/> + <data name="sig_genes" format="csv" from_work_dir="genes.csv" label="${tool.name} genes on ${on_string}"/> + </outputs> + <tests> + <test> + <param name="input_file" value="polII.bed" ftype="bedgraph"/> + <param name="genes" value="genes.gff" ftype="gff"/> + <param name="fdr" value="1"/> + <param name="iter" value="500"/> + <output name="details" value="details.csv" compare="sim_size"/> + <output name="sig_genes" value="genes.csv" compare="sim_size"/> + </test> + </tests> + <help><![CDATA[ +What it does +------------ + +Rscript for calculating average PolII occupancy and FDR for RNA Pol II DamID datasets, based on original algorithms developed by Tony Southall. +The script processes datafiles in gatc.bedgraph or gatc.gff format, such as those generated by damidseq_pipeline. + +Differences between RNA pol II DamID and RNAseq +----------------------------------------------- +Although both are methods for transcriptional profiling, please be aware that there may be differences between these two methods. In particular, transcript abundancy as assessed through RNAseq will depend on transcript stability, whereas RNA pol II occupancy may provide a better indication of transcription levels. + + + ]]></help> + <citations> + <citation type="doi">10.1093/bioinformatics/btv386</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/polii.gene.call Fri Apr 20 10:27:20 2018 -0400 @@ -0,0 +1,372 @@ +#!/usr/bin/env Rscript + +# polii.gene.call.r +# Copyright © 2014-15, Owen Marshall + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at +# your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 +# USA + +### FDR calcs ### +# Method based on original perl scripts by Tony Southall (TDS) as published in +# Southall et al. (2013). Dev Cell, 26(1), 101–12. doi:10.1016/j.devcel.2013.05.020 +# +# Significant modifications to the original methodology include: +# * taking a linear regression of log data rather than trial-and-error curve fitting of non-log data +# * using a linear regression for the final intercept value rather than using the average intercept value for all conditions +# -- both of these should increase the accuracy of the final FDR value. + +version <- "1.0.2" +cat(paste("polii.gene.call v",version,"\n", sep="")) + +library(tools) + +### Read CLI options +input.args <- commandArgs(trailingOnly = TRUE) + +in.files <- vector() +read.ops <- function (x) { + for (op in x) { + if (any(grepl("^--",op))) { + op <- gsub("^--","",op) + y <- unlist(strsplit(op,"=")) + + if (y[1] == "help") { + cat(paste("Usage: Rscript polii.gene.call.r --genes.file=[some_genes_file.gff] [list of .gatc.bedgraph or .gatc.gff ratio files to process]\n\n", sep="")) + + cat("Options:\n") + for (n in names(op.args)) { + cat(paste(" ",n,"=",op.args[[n]],"\n",sep="")) + } + cat("\n") + quit("no",1) + } + + if (!is.null(op.args[[ y[1] ]])) { + op.args[[ y[1] ]] <<- y[2] + } else { + cat("Error: Option",y[1],"not recognised ...\n") + quit("no",1) + } + } else { + in.files <<- c(in.files,op) + } + } +} + +write.ops <- function () { + out.df <- data.frame() + for (n in names(op.args)) { + v <<- as.character(op.args[[n]]) + df.line <- data.frame( + option=n, + value=v + ) + out.df <- rbind(out.df, df.line) + } + write.table(out.df,"input.args.single.txt",row.names=F) +} + +op.args <- list( + "genes.file" = "/mnt/data/Genomes/dmel_release/DmR6/DmR6.genes.gff", + "iter" = 50000, + "fdr" = 0.01 +) + +read.ops(input.args) + +if (length(in.files) == 0) { + cat("Usage: Rscript polii.gene.call.r [list of .gatc.gff ratio files to process]\n\n") + quit("no",1) +} + +write.ops() + +### save random seed for future reproducibility +dump.random <- runif(1) +my.seed <- .Random.seed +write.table(my.seed,".randomseed") + +### read genes file +cat("Reading genes data file ...\n") +genes.file=op.args[["genes.file"]] +genes <- read.table(genes.file, comment.char="#", sep="\t", quote="", fill=T) +names(genes) <- c('chr','source','type','start','end','score','strand','c','details') + +# only subset if there is a type termed "gene" +if (any(genes$type == 'gene')) { + genes <- subset(genes, type=='gene') +} + +genes$name <- sapply(genes$details, FUN = function (x) {regmatches(x,gregexpr("(?<=Name=).*?(?=;)", x, perl=T))} ) +genes <- genes[,c('chr','start','end','strand','name')] +genes$chr <- gsub("^chr","",genes$chr,perl=T) + +if (nrow(genes) == 0) { + cat("Error: unable to extract gene information from genes file\n\n") + quit("no",1) +} + +### functions +read.gff <- function (x,name="score") { + fn.ext <- file_ext(x) + + if (grepl("gff",ignore.case=T,fn.ext)) { + temp.data <- read.table(x,row.names=NULL) + if (ncol(temp.data) > 5) { + # GFF + trim.data <- temp.data[,c(1,4,5,6)] + } else { + cat("Error: file does not appear to be in GFF format\n\n") + quit("no",1) + } + } else if (grepl("bed",ignore.case=T,fn.ext)) { + temp.data <- read.table(x,row.names=NULL,skip=1) + if (ncol(temp.data) == 4) { + # bedgraph + trim.data <- temp.data + } else { + cat("Error: file does not appear to be in bedGraph format\n\n") + quit("no",1) + } + } else { + cat("Error: input file does not appear to be in bedGraph or GFF format ...\n\n") + quit("no",1) + } + + names(trim.data) <- c("chr","start","end",name) + + trim.data$chr <- gsub("^chr","",trim.data$chr,perl=T) + + return(trim.data) +} + +gene.exp <- function (input.df, buffer=0, iter=50000, debug=F) { + + avg.exp <- data.frame(input.df[1,c(4:(length(names(input.df))))]) + avg <- vector(length=(length(names(input.df)) - 4)) + avg.exp <- avg.exp[0,] + + ### FDR calcs ### + # Method based off perl scripts by Tony Southall (TDS) as published in + # Southall et al. (2013). Dev Cell, 26(1), 101–12. doi:10.1016/j.devcel.2013.05.020 + # + # Significant modifications to the original methodology include: + # * taking a linear regression of log data rather than trial-and-error curve fitting of non-log data + # * using a linear regression for the final intercept value rather than using the average intercept value for all conditions + # -- both of these should increase the accuracy of the final FDR value. + + input.len <- length(input.df[,1]) + frag.samp <- c(1,2,3,4,6,8,10,12,15) + thres.samp <- c(0.1,0.2,0.3,0.4,0.5,0.65,0.8,1.0,1.5,2.0) + + rand <- list() + + for (thres in thres.samp) { + + cat(paste(" Calculating FDR for threshold",thres,"\n",sep=" ")) + + # init vars + for (f in frag.samp) { + # List names are, e.g. frag 1, thres 0.2: rand[[thres.1.0.2]] + rand[[paste("thres.",f,".",thres,sep="")]] <- 0; + } + + for (i in 1:iter) { + if (i %% 200 == 0) {cat(paste(" iter",i,"\r"))} + # get random sample for different fragment lengths + + rand.samp <- list() + + for (f in frag.samp) { + # Using the fourth column as we're only calculating FDR for one sample ... + rand.samp[[paste("rand.",f,sep="")]] <- mean(input.df[runif(f,1,input.len),4]) + } + + # count number of times exp > thres + for (f in frag.samp) { + if (rand.samp[[paste("rand.",f,sep="")]] > thres) {rand[[paste("thres.",f,".",thres,sep="")]] <- rand[[paste("thres.",f,".",thres,sep="")]] + 1} + } + } + } + + rand.fdr <- list() + + for (thres in thres.samp) { + for (f in frag.samp) { + rand.fdr[[paste("thres.",f,".",thres,sep="")]] <- rand[[paste("thres.",f,".",thres,sep="")]]/iter + } + } + + cat("Fitting curves ...\n") + + # curve fit: fdr vs thresholds + var.thres <- list() + + for (thres in thres.samp) { + for (f in frag.samp) { + var.thres[[paste("frags.",f,sep="")]] <- append(var.thres[[paste("frags.",f,sep="")]], rand.fdr[[paste("thres.",f,".",thres,sep="")]]) + } + } + + inf.log.lm <- function (v) { + non.inf <- log(v) != -Inf + ret <- lm(log(v)[non.inf] ~ thres.samp[non.inf]) + return(ret) + } + + # The relationship is exponential, so we need log data for a linear regression + # (in R, linear regression is: y = lm$coefficients[[2]]x + lm$coefficients[[1]] ... ) + var.lm <- list() + for (f in frag.samp) { + var.lm[[paste("frags.",f,sep="")]] <- inf.log.lm(var.thres[[paste("frags.",f,sep="")]]) + } + + # ... and now we do a linear regression on the slopes and intercepts of our previous regressions + # (This is the clever bit, and it actually seems to work. The correlation of slope to fragment size is linear ... + # By doing this on the slope and intercept, we can now predict the FDR for any number of fragments with any expression.) + slope <- vector() + for (f in frag.samp) { + slope <- append(slope, var.lm[[paste("frags.",f,sep="")]]$coefficients[[2]]) + } + + # slope regression predicts the average slope + slope.lm <- lm(slope ~ frag.samp) + + # TDS used an average intercept value for the intercept, however ... + inter <- vector() + for (f in frag.samp) { + inter <- append(inter, var.lm[[paste("frags.",f,sep="")]]$coefficients[[1]]) + } + + # ... there's actually quite a bit of variation of the intercept with real data, + # so we're going to do a lin regression on the intercept instead. + # + # (I'm not convinced it's a true linear relationship. But it's close to linear, + # and will certainly perform better than taking the mean intercept ...) + # + # If you're interested, set the debug flag to TRUE and take a look at the plots generated below ... + inter.lm <- lm(inter ~ frag.samp) + + if (debug == T) { + # plots for debugging/checking + plot.debug <- function (y,x,l,name="Debug plot") { + plot(y ~ x) + abline(l) + + lsum <- summary(l) + r2 <- lsum$r.squared + + legend("topleft",legend=r2,bty='n') + title(name) + + dev.copy(png,paste(name,".png",sep=""));dev.off() + } + + plot.debug(slope,frag.samp,slope.lm,name="correlation of slope") + plot.debug(inter,frag.samp,inter.lm,name="correlation of intercepts") + } + + # ok, so putting that all together ... + fdr <- function (frags, expr) { + inter.test <- inter.lm$coefficients[[2]] * frags + inter.lm$coefficients[[1]] + slope.test <- slope.lm$coefficients[[2]] * frags + slope.lm$coefficients[[1]] + + fdr.out <- exp(slope.test * expr + inter.test) + return(fdr.out) + } + + ### Gene expression values ### + cat("Calculating gene values ...\n") + + count <- 0 + + # unroll chromosomes for speed: + for (chromo in unique(genes$chr)) { + input.chr <- subset(input.df, chr==chromo) + genes.chr <- subset(genes, chr==chromo) + for (i in 1:length(genes.chr$name)) { + # Roll through each gene + + # Note: the original script calculated expression values for a table of proteins, + # here we just limit ourselves to the FDR for the first column past "chr", "start" and "end" + # so if you're thinking of looking at chromatin, for example, put PolII as column 4 in your table! + + gene.start <- genes.chr[i,"start"] - buffer + gene.end <- genes.chr[i,"end"] + buffer + + gene.start <- ifelse(gene.start < 1, 1, gene.start) + + # Create data frames for all gatc fragments covering current gene + exp <- data.frame(input.chr[ (input.chr$start <= gene.end) + & (input.chr$end >= gene.start) + ,] ) + + gatc.num <- length(exp[,1]) + + # skip if no gatc fragments cover gene :( + if (gatc.num == 0) {next} + + # trim to gene boundaries ... + exp$start[1] <- gene.start + exp$end[length(exp[,1])] <- gene.end + + # gene length covered by gatc fragments + len <- sum(exp$end-exp$start) + + # calculate weighted score for each column (representing different proteins) + for (j in 4:length(names(input.chr))) { + avg[j] <- (sum((exp$end-exp$start)*exp[j]))/len + } + + # make data.frame of averages (to be appended to avg.exp) + df <- cbind(avg[1]) + for (k in 2:length(avg)) { + df <- cbind(df,avg[k]) + } + df <- cbind(df,gatc.num) + + # only fdr for first column for now ... + gene.fdr <- fdr(gatc.num,avg[4]) + df <- cbind(df, gene.fdr) + + # append current gene to list + avg.exp <- rbind(avg.exp,data.frame(name=as.character(genes.chr[i,"name"]), df)) + count <- count+1 + if (count %% 50 == 0) {cat(paste(count,"genes averaged ...\r"))} + } + } + + avg.exp <- avg.exp[,c(1,5:(length(names(avg.exp))))] + names(avg.exp) <- c("name",names(input.df)[c(4:(length(names(input.df))))],"gatc.num","FDR") + + return(avg.exp) +} + +for (name in in.files) { + cat(paste("\nNow working on",name,"...\n")) + bname <- basename(name) + fname <- sub("\\..*","",bname,perl=T) + + polii <- read.gff(name,"polii") + + polii.exp <- gene.exp(polii, iter=as.numeric(op.args[["iter"]])) + + out <- subset(polii.exp,FDR < op.args[["fdr"]]) + + write.table(polii.exp,paste(fname,"genes.details.csv",sep="."),row.names=F,col.names=T,quote=T,sep="\t") + write.table(out$name, paste(fname,"genes",sep="."),row.names=F,col.names=F,quote=F) +} + +cat("\nAll done.\n\n")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/details.csv Fri Apr 20 10:27:20 2018 -0400 @@ -0,0 +1,11 @@ +"name" "polii" "gatc.num" "FDR" +"CR46254" 0.356507105200981 5 0.791940813158031 +"CR45339" 1.20628998889217 2 0.260660674330585 +"CR45340" 1.93867996741526 5 0.0511077462101616 +"dbr" 1.2951093442343 22 0.0152079814861417 +"CR44987" 2.49935540607138 24 1.97089127262964e-05 +"galectin" 2.38923790079731 21 9.84816956841954e-05 +"CG11374" 0.00926960982128576 5 1.4451260959191 +"net" 0.663390756192582 14 0.394156836924802 +"Zir" -0.337727067030228 30 181.086928253756 +"CG11377" 0.263311791595758 9 1.16681003833418
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genes.csv Fri Apr 20 10:27:20 2018 -0400 @@ -0,0 +1,5 @@ +CR46254 +CR45339 +CR45340 +dbr +net
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genes.gff Fri Apr 20 10:27:20 2018 -0400 @@ -0,0 +1,16 @@ +2L FlyBase gene 54817 55767 . + . ID=FBgn0267987;Name=CR46254;Ontology_term=SO:0000011,SO:0000087;Dbxref=FlyBase_Annotation_IDs:CR46254;derived_computed_cyto=21B1-21B1 +2L FlyBase ncRNA 54817 55767 . + . ID=FBtr0347585;Name=CR46254-RA;Parent=FBgn0267987;Dbxref=FlyBase_Annotation_IDs:CR46254-RA;score_text=Weakly Supported;score=0 +2L FlyBase gene 65999 66242 . + . ID=FBgn0266878;Name=CR45339;Ontology_term=SO:0000011,SO:0000087;Dbxref=FlyBase_Annotation_IDs:CR45339,EntrezGene:19834983,GenomeRNAi:19834983;derived_computed_cyto=21B1-21B1 +2L FlyBase ncRNA 65999 66242 . + . ID=FBtr0345732;Name=CR45339-RA;Parent=FBgn0266878;Dbxref=FlyBase_Annotation_IDs:CR45339-RA,REFSEQ:NR_123995;score_text=Weakly Supported;score=0 +2L FlyBase gene 66318 66524 . + . ID=FBgn0266879;Name=CR45340;Ontology_term=SO:0000011,SO:0000087;Dbxref=FlyBase_Annotation_IDs:CR45340,EntrezGene:19835538,GenomeRNAi:19835538;derived_computed_cyto=21B1-21B1 +2L FlyBase ncRNA 66318 66524 . + . ID=FBtr0345733;Name=CR45340-RA;Parent=FBgn0266879;Dbxref=FlyBase_Annotation_IDs:CR45340-RA,REFSEQ:NR_123996;score_text=Weakly Supported;score=0 +2L FlyBase gene 66482 71390 . + . ID=FBgn0067779;Name=dbr;fullname=debra;Alias=FBgn0028472,FBgn0043386,CG11371,Dbr,EP456,BcDNA:LD26519,EP0456,Debra;Ontology_term=SO:0000010,SO:0000087,GO:0005771,GO:0007616,GO:0005634,GO:0008270;Dbxref=FlyBase:FBan0011371,FlyBase_Annotation_IDs:CG11371,GB_protein:AAF51565,GB:AA942362,GB:AF184228,GB_protein:AAD55739,GB:AI109558,GB:AI402147,GB:AQ026007,GB:AQ073408,GB:AW942847,GB:AY047576,GB_protein:AAK77308,GB:CL527762,GB:CZ468654,GB:CZ470443,GB:CZ470758,GB:CZ471260,GB:CZ474676,GB:CZ474677,GB:CZ477845,GB:CZ482562,GB:CZ487746,UniProt/TrEMBL:Q9V474,INTERPRO:IPR012934,OrthoDB7_Drosophila:EOG79H5RG,GB_protein:AFH03489,GB_protein:AFH03487,GB_protein:AFH03486,GB_protein:AFH03488,EntrezGene:33161,UniProt/TrEMBL:Q961T5,BIOGRID:59424,FlyAtlas:CG11371-RB,InterologFinder:33161,GenomeRNAi:33161,INTERACTIVEFLY:/genebrief/debra.htm;gbunit=AE014134;derived_computed_cyto=21B1-21B1 +2L FlyBase gene 71039 73836 . - . ID=FBgn0266322;Name=CR44987;Ontology_term=SO:0000011,SO:0000087,SO:0000077;Dbxref=EntrezGene:19835103,FlyBase_Annotation_IDs:CR44987,GenomeRNAi:19835103;derived_computed_cyto=21B1-21B1 +2L FlyBase ncRNA 71039 73836 . - . ID=FBtr0344052;Name=CR44987-RA;Parent=FBgn0266322;Dbxref=REFSEQ:NR_123998,FlyBase_Annotation_IDs:CR44987-RA;score_text=Weakly Supported;score=0 +2L FlyBase ncRNA 71039 73836 . - . ID=FBtr0344053;Name=CR44987-RB;Parent=FBgn0266322;Dbxref=REFSEQ:NR_123997,FlyBase_Annotation_IDs:CR44987-RB;score_text=Weakly Supported;score=0 +2L FlyBase gene 71757 76211 . + . ID=FBgn0031213;Name=galectin;fullname=galectin;Alias=CG11372,Dmgal;Ontology_term=SO:0000010,SO:0000087,GO:0016936,GO:0005829,GO:0015143,GO:0008039,GO:0030246;Dbxref=FlyBase:FBan0011372,FlyBase_Annotation_IDs:CG11372,GB_protein:AAF51564,GB_protein:ACZ94130,GB_protein:ACZ94129,GB:AF338142,GB_protein:AAL87743,GB:AY071294,GB_protein:AAL48916,GB:BI236456,GB:CZ467438,GB:CZ467439,GB:CZ468006,GB:CZ481733,GB:CZ481734,GB:CZ485874,GB:CZ485875,UniProt/TrEMBL:Q9VPI6,INTERPRO:IPR001079,INTERPRO:IPR013320,GB_protein:AGB92325,UniProt/TrEMBL:C0PUZ0,UniProt/TrEMBL:M9NCX9,OrthoDB7_Drosophila:EOG7XQ9R2,OrthoDB7_Diptera:EOG76TN7W,GB_protein:AFH03490,EntrezGene:33162,UniProt/TrEMBL:E1JHP9,UniProt/TrEMBL:E1JHQ0,BDGP_clone:RE32514,OrthoDB7_Insecta:EOG76QSHF,OrthoDB7_Arthropoda:EOG7WTGG5,OrthoDB7_Metazoa:EOG7S7SFG,InterologFinder:33162,BIOGRID:59425,FlyAtlas:CG11372-RA,GenomeRNAi:33162;gbunit=AE014134;derived_computed_cyto=21B1-21B1 +2L FlyBase gene 76348 77783 . + . ID=FBgn0031214;Name=CG11374;Ontology_term=SO:0000010,SO:0000087,GO:0016936,GO:0015143,GO:0030246;Dbxref=FlyBase:FBan0011374,FlyBase_Annotation_IDs:CG11374,UniProt/TrEMBL:Q9VPI7,INTERPRO:IPR001079,INTERPRO:IPR013320,OrthoDB7_Drosophila:EOG735GGT,GB_protein:AAF51563,EntrezGene:33163,OrthoDB7_Insecta:EOG76QSHF,OrthoDB7_Arthropoda:EOG7WTGG5,OrthoDB7_Metazoa:EOG7S7SFG,InterologFinder:33163,FlyAtlas:CG11374-RA,GenomeRNAi:33163;gbunit=AE014134;derived_computed_cyto=21B1-21B1 +2L FlyBase gene 82421 87387 . - . ID=FBgn0002931;Name=net;fullname=net;Alias=FBgn0031215,CG11450,Shout,Group IId,shout,bHLHc42;Ontology_term=SO:0000010,SO:0000087,GO:0005634,GO:0003700,GO:0008586,GO:0006355,GO:0007474,GO:0000122,GO:0046983;Dbxref=FlyBase:FBan0011450,FlyBase_Annotation_IDs:CG11450,GB_protein:AAF51562,GB:AF303350,GB_protein:AAK14073,GB:CZ478919,GB:CZ478940,UniProt/TrEMBL:Q9VPI8,INTERPRO:IPR011598,GB_protein:AGB92326,OrthoDB7_Drosophila:EOG774925,OrthoDB7_Diptera:EOG786TM8,EntrezGene:45339,BDGP_clone:PC00159,UniProt/TrEMBL:Q7KH24,OrthoDB7_Insecta:EOG73VJM7,OrthoDB7_Arthropoda:EOG7SRH34,OrthoDB7_Metazoa:EOG73NG5F,BIOGRID:69604,FlyAtlas:CG11450-RA,InterologFinder:45339,GenomeRNAi:45339,INTERACTIVEFLY:/gene/net1.htm;gbunit=AE014134;derived_computed_cyto=21B1-21B1 +2L FlyBase gene 94739 102086 . + . ID=FBgn0031216;Name=Zir;fullname=Zizimin-related;Alias=FBgn0025528,anon-EST:Posey54,Dm zir,CG11376,FBgn 31216;Ontology_term=SO:0000010,SO:0000087,GO:0035010,GO:0006909,GO:0005089,GO:0035023,GO:0035011,GO:0005575;Dbxref=FlyBase:FBan0011376,FlyBase_Annotation_IDs:CG11376,GB_protein:AAF51561,GB:AA391952,GB:AF083301,GB:AW942162,GB:AZ877629,GB:BT015273,GB_protein:AAT94502,GB:CZ468078,GB:CZ468665,GB:CZ469343,GB:CZ470525,GB:CZ470937,GB:CZ480810,GB:CZ480811,GB:CZ483732,UniProt/TrEMBL:Q9VPI9,INTERPRO:IPR027007,INTERPRO:IPR027357,INTERPRO:IPR026791,OrthoDB7_Drosophila:EOG7DP1D5,OrthoDB7_Diptera:EOG78133C,EntrezGene:33165,INTERPRO:IPR021816,INTERPRO:IPR010703,BDGP_clone:LD10883,OrthoDB7_Insecta:EOG71CSK0,OrthoDB7_Arthropoda:EOG735G9K,OrthoDB7_Metazoa:EOG7P8P7G,BIOGRID:59427,InterologFinder:33165,FlyAtlas:CG11376-RA,Fly-FISH:CG11376,GenomeRNAi:33165;gbunit=AE014134;derived_computed_cyto=21B1-21B2 +2L FlyBase gene 102380 104142 . + . ID=FBgn0031217;Name=CG11377;Alias=CT31762;Ontology_term=SO:0000010,SO:0000087,GO:0005509,SO:0000548;Dbxref=FlyBase:FBan0011377,FlyBase_Annotation_IDs:CG11377,GB_protein:AAF51560,GB:AY071548,GB_protein:AAL49170,GB:BI371522,UniProt/TrEMBL:Q9VPJ0,INTERPRO:IPR000742,INTERPRO:IPR002049,INTERPRO:IPR006212,INTERPRO:IPR009030,INTERPRO:IPR013032,INTERPRO:IPR018097,INTERPRO:IPR021852,INTERPRO:IPR001881,GB_protein:AGB92327,UniProt/TrEMBL:M9PDP6,OrthoDB7_Drosophila:EOG799F97,OrthoDB7_Diptera:EOG7QK9VM,EntrezGene:33166,OrthoDB7_Insecta:EOG7GR8VQ,OrthoDB7_Arthropoda:EOG7PSCW4,OrthoDB7_Metazoa:EOG7Z3F4B,InterologFinder:33166,BIOGRID:59428,FlyAtlas:CG11377-RA,Fly-FISH:CG11377,GenomeRNAi:33166;gbunit=AE014134;derived_computed_cyto=21B2-21B2 +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/polII.bed Fri Apr 20 10:27:20 2018 -0400 @@ -0,0 +1,111 @@ +2L 54816 55594 0.0552525047833769 +2L 54816 55594 0.0552525047833769 +2L 55594 55635 0.818636112867944 +2L 55594 55635 0.818636112867944 +2L 55635 55767 1.09961409723048 +2L 55635 55767 1.09961409723048 +2L 65998 66242 1.20628998889217 +2L 65998 66242 1.20628998889217 +2L 66317 66422 1.20628998889217 +2L 66317 66422 1.20628998889217 +2L 66422 66524 2.5583945646271 +2L 66422 66524 2.5583945646271 +2L 66481 67274 2.5583945646271 +2L 67274 67324 1.98680486782346 +2L 67324 67817 2.08837880305762 +2L 67817 67905 1.86112283695141 +2L 67905 68330 1.62486655160691 +2L 68330 68393 1.54900967010231 +2L 68393 68886 1.11119688964443 +2L 68886 69103 0.869617644974253 +2L 69103 69241 0.387235732197361 +2L 69241 69301 0.418133867661119 +2L 69301 69792 0.899479539841806 +2L 69792 69796 0.823796983227429 +2L 69796 69820 0.823796983227429 +2L 69820 69878 0.549942264108176 +2L 69878 69959 0.761892304255089 +2L 69959 70111 1.1531457850107 +2L 70111 71390 0.846518854258207 +2L 71038 71492 0.846518854258207 +2L 71038 71492 0.846518854258207 +2L 71038 71492 0.846518854258207 +2L 71492 71990 3.39665608645984 +2L 71492 71990 3.39665608645984 +2L 71492 71990 3.39665608645984 +2L 71756 71990 3.39665608645984 +2L 71990 72652 3.40816353628327 +2L 71990 72652 3.40816353628327 +2L 71990 72652 3.40816353628327 +2L 71990 72652 3.40816353628327 +2L 72652 72787 2.8478823465353 +2L 72652 72787 2.8478823465353 +2L 72652 72787 2.8478823465353 +2L 72652 72787 2.8478823465353 +2L 72787 73420 2.9137462183582 +2L 72787 73420 2.9137462183582 +2L 72787 73420 2.9137462183582 +2L 72787 73420 2.9137462183582 +2L 73420 73836 1.07916756002746 +2L 73420 73836 1.07916756002746 +2L 73420 73836 1.07916756002746 +2L 73420 75584 1.07916756002746 +2L 75584 76211 0.351389501360189 +2L 76347 76557 0.351389501360189 +2L 76557 76602 0.111868083546615 +2L 76602 76694 -0.103143189574513 +2L 76694 77646 -0.0832154089110144 +2L 77646 77783 0.171807761122692 +2L 82420 82686 0.950799825059922 +2L 82686 82987 0.860805501917355 +2L 82987 83358 0.9016755036993 +2L 83358 83622 0.4912523962063 +2L 83622 83828 0.574814522065537 +2L 83828 84509 0.950829410021384 +2L 84509 84651 1.13276692007025 +2L 84651 84867 1.40640395753805 +2L 84867 85682 0.810578385233812 +2L 85682 85687 0.675864965432377 +2L 85687 86075 0.661825886242909 +2L 86075 86215 0.349192348167705 +2L 86215 86316 0.0920730976226941 +2L 86316 87387 0.102318953834602 +2L 94738 95132 -0.106500647321927 +2L 95132 95199 -0.0638569168395719 +2L 95199 95277 0.217427011118734 +2L 95277 95514 0.267705566854591 +2L 95514 95662 0.0612754941064784 +2L 95662 95731 -0.429957911575681 +2L 95731 95761 -0.556019144100863 +2L 95761 96385 -0.725477533205267 +2L 96385 96412 -0.655603418929584 +2L 96412 96491 -0.409536719736943 +2L 96491 97369 -0.226244771136778 +2L 97369 97583 -0.308700411197029 +2L 97583 97642 0.0394679913725759 +2L 97642 97825 0.217457643667941 +2L 97825 98118 0.0793574230565764 +2L 98118 98244 -0.00404328552099076 +2L 98244 98316 -0.285051719568665 +2L 98316 98529 -0.52337489352199 +2L 98529 98813 -0.853641568435678 +2L 98813 99183 -1.20840278715919 +2L 99183 99266 -0.57988161546026 +2L 99266 99666 -0.0718062889470329 +2L 99666 99974 -0.313463689842528 +2L 99974 100074 -0.962054132059592 +2L 100074 100636 -1.19450144164682 +2L 100636 100841 -0.134973412111241 +2L 100841 101514 0.0638744109411819 +2L 101514 101553 -0.0533119771867642 +2L 101553 102022 -0.0462385325318383 +2L 102379 102687 -0.0746051708557362 +2L 102022 102086 -0.0746051708557362 +2L 102687 102730 -0.0466007641328778 +2L 102730 102832 0.129772291996865 +2L 102832 103057 0.328567918234615 +2L 103057 103345 0.287560490314009 +2L 103345 103349 0.278800162489859 +2L 103349 103506 -0.00904364838917334 +2L 103506 103683 0.126271941123851 +2L 103683 104142 0.646700520857224