# HG changeset patch # User alvarofaure # Date 1544617120 18000 # Node ID 7fdf47a0bae870892e85549952c242834dbcf1f8 Uploaded diff -r 000000000000 -r 7fdf47a0bae8 chromeister/README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/README.md Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,36 @@ +# chromeister +An ultra fast, heuristic approach to detect conserved signals in extremely large pairwise genome comparisons. + +## Requirements + +GCC compiler (any version that is not completely outdated should do) and the R-base package (default R installation should do) with no extra packages. +Simply download the .zip and unzip it, or clone the repository (currently the active branch is "histo-kmer-ultra" DO NOT USE ANY OTHER). +Then issue the following commands: + +cd chromeister/src && make all + +You are ready to go! + +## Use + +There are several ways in which CHROMEISTER can be used. The simplest one is to run a 1-vs-1 comparison and then compute the score and the plot. +To do so, use the binaries at the bin folder: + +CHROMEISTER -query seqX -db seqY -out dotplot.mat && Rscript compute_score.R dotplot.mat + +This will generate the comparison matrix, the plot of the comparison with the automatic scoring and the guides to be used in an exhaustive GECKO comparison. + +More to be added soon! + + + + + + + + + + + + + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/CHROMEISTER Binary file chromeister/bin/CHROMEISTER has changed diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/allVsAll.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/allVsAll.sh Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +DIR=$1 +EXT=$2 +DIM=$3 +KMER=$4 + +array=() +x=0 + +if [ $# != 4 ]; then + echo "***ERROR*** Use: $0 genomesDirectory extension dim kmer" + exit -1 +fi + +indexnameA=$(basename "$DIR") + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +for elem in $(ls -d $DIR/*.$EXT | awk -F "/" '{print $NF}' | awk -F ".$EXT" '{print $1}') +do + array[$x]=$elem + x=`expr $x + 1` + #echo "X: $elem" +done + +for ((i=0 ; i < ${#array[@]} ; i++)) +do + for ((j=i ; j < ${#array[@]} ; j++)) + do + if [ $i != $j ]; then + seqX=${array[$i]} + seqY=${array[$j]} + echo "----------${seqX}-${seqY}-----------" + + + if [[ ! -f ${seqX}.$EXT-${seqY}.$EXT.mat ]]; then + + $BINDIR/run_and_plot_chromeister.sh $DIR/${seqX}.$EXT $DIR/${seqY}.$EXT $KMER $DIM + #Rscript $BINDIR/compute_score.R $seqX.$EXT-$seqY.$EXT.mat $DIM > $seqX.$EXT-$seqY.$EXT.scr.txt + fi + + fi + done +done + +# generate index +if [[ ! -f index.csv.temp ]] && [ ! -f index-$indexnameA.csv ]; then + + echo "Launching... $BINDIR/index_chromeister_solo.sh index-$indexnameA.csv $DIR $DIR" + $BINDIR/index_chromeister_solo.sh . index-$indexnameA.csv $DIR $DIR +fi + + + + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/allVsAll_incremental.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/allVsAll_incremental.sh Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +DIR=$1 +DIR2=$2 +EXT=$3 +DIM=$4 +KMER=$5 + + + +if [ $# != 5 ]; then + echo "***ERROR*** Use: $0 genomesDirectory1 genomesDirectory2 extension dim kmer" + exit -1 +fi + +indexnameA=$(basename "$DIR") +indexnameB=$(basename "$DIR2") + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + + +array=() +x=0 +array2=() + +for elem in $(ls -d $DIR/*.$EXT | awk -F "/" '{print $NF}' | awk -F ".$EXT" '{print $1}') +do + array[$x]=$elem + x=`expr $x + 1` + #echo "X: $elem" +done + +x=0 + +for elem in $(ls -d $DIR2/*.$EXT | awk -F "/" '{print $NF}' | awk -F ".$EXT" '{print $1}') +do + array2[$x]=$elem + x=`expr $x + 1` + #echo "X: $elem" +done + +for ((i=0 ; i < ${#array[@]} ; i++)) +do + for ((j=0 ; j < ${#array2[@]} ; j++)) + do + seqX=${array[$i]} + seqY=${array2[$j]} + echo "----------${seqX}-${seqY}-----------" + + + #echo "$BINDIR/run_and_plot_chromeister.sh $DIR/${seqX}.$EXT $DIR/${seqY}.$EXT 30 10000" + if [[ ! -f ${seqX}.$EXT-${seqY}.$EXT.mat ]]; then + $BINDIR/run_and_plot_chromeister.sh $DIR/${seqX}.$EXT $DIR2/${seqY}.$EXT $KMER $DIM + fi + + done +done + + +# generate index +if [[ ! -f index.csv.temp ]] && [ ! -f index-$indexnameA-$indexnameB.csv ]; then + + echo "Launching... $BINDIR/index_chromeister_solo.sh . index-$indexnameA-$indexnameB.csv $DIR $DIR2" + $BINDIR/index_chromeister_solo.sh . index-$indexnameA-$indexnameB.csv $DIR $DIR2 +fi + + + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/compute_score.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/compute_score.R Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,478 @@ + +#!/usr/bin/env Rscript +args = commandArgs(trailingOnly=TRUE) + +if(length(args) < 2){ + stop("USE: Rscript --vanilla plot.R ") +} + + + + + + +growing_regions <- function(mat, reward = 6, penalty = 5, sidePenalty = 3, MAXHSPS = 500, TH = 10, WSIZE = 7){ + + #write(mat, file ="data.txt", ncolumns = 200) + + len <- min(length(mat[1,]), length(mat[,1])) + HSPS <- matrix(0, nrow=MAXHSPS, ncol=5) + idx <- 1 + lH <- round(WSIZE/2) - 1 + rH <- round(WSIZE/2) + 1 + + + #print(paste("lH: ", lH, "rH: ", rH)) + + if(WSIZE %% 2 == 0){ + print("WSIZE MUST BE ODD") + stop() + } + + + + i <- 1 + #readline(prompt="Press [enter] to continue") + while(i < len){ + + value <- max(mat[i,]) * reward + if(value == 0) i <- i + 1 + pos <- which.max(mat[i,]) + # these two hold ending frag + endfrag <- pos + j <- i + count_penalties <- 1 + + #print("-----------------") + + while(value > 0 && j < len){ + + #print(paste("took values ", j, endfrag, "which have score of ", mat[j, endfrag], "current value is: ", value)) + + + + + # Reset position used + + mat[max(1,j-1), max(1,endfrag-2)] <- 0 + mat[max(1,j-1), max(1,endfrag-1)] <- 0 + mat[max(1,j-1), endfrag] <- 0 + mat[max(1,j-1), min(len, endfrag+1)] <- 0 + mat[max(1,j-1), min(len, endfrag+2)] <- 0 + + mat[j, max(1,endfrag-2)] <- 0 + mat[j, max(1,endfrag-1)] <- 0 + mat[j, endfrag] <- 0 + mat[j, min(len, endfrag+1)] <- 0 + mat[j, min(len, endfrag+2)] <- 0 + + #print(paste("Erasing: (",max(1,j-1), max(1,endfrag-2),")(",max(1,j-1), max(1,endfrag-1),")(",max(1,j-1), endfrag, + # ")(",max(1,j-1), min(len, endfrag+1),")(",max(1,j-1), min(len, endfrag+2),")(",j, max(1,endfrag-2), + # ")(",j, max(1,endfrag-1),")(",j, endfrag,")(",j, min(len, endfrag+1),")(",j, min(len, endfrag+2),")")) + + + # Go for next + j <- j + 1 + # Check next, could be reverse or forward + mleft <- max(1, endfrag-lH) + mright <- min(len, endfrag+lH) + window <- mat[j, mleft:mright] + + + #print(paste("mleft: ", mleft, "mright", mright, "j is: ", j)) + #print(window) + + v <- max(window) + selected <- which.max(window) + # Make it rather go diagonally + #print(paste("WIndow len: ", length(window))) + chose_diagonal <- FALSE + if(length(window) == WSIZE && v == window[lH]){ + selected <- lH + chose_diagonal <- TRUE + } + + if(length(window) == WSIZE && v == window[rH]){ + selected <- rH + chose_diagonal <- TRUE + } + + #print(paste("Selected value be like ", selected)) + + + if(v != 0){ + + + + endfrag <- (mleft + selected ) # To make the indexing + if(length(window) == WSIZE) endfrag <- endfrag - 1 + endfrag <- max(1, min(len, endfrag)) + #print(paste("\t", " endfragnew = ", endfrag, "max of window: ", max(window), "on position", which.max(window))) + } + + #print(paste("Chose diagonal is ", chose_diagonal)) + + # If no similarity is found + if(v == 0){ + value <- value - count_penalties * penalty + count_penalties <- count_penalties + 1 + #print("Got penalty #########") + + }else{ + + # Similarity is found + + if(!chose_diagonal){ + value <- value + count_penalties * (-sidePenalty) + count_penalties <- count_penalties + 1 + #print("Got SIDE penalty @@@ #########") + }else{ + count_penalties <- 1 + value <- value + reward + #print("Got reward thou #########") + } + + } + #readline(prompt="Press [enter] to continue") + + } + # Check len of frag + if(j-i > TH){ + # HSPS[idx, 1] <- i + # HSPS[idx, 2] <- pos + # HSPS[idx, 3] <- j + # HSPS[idx, 4] <- endfrag + # HSPS[idx, 5] <- abs(i-j) + + HSPS[idx, 1] <- pos + HSPS[idx, 2] <- i + HSPS[idx, 3] <- endfrag + HSPS[idx, 4] <- j + HSPS[idx, 5] <- abs(i-j) + + + + #print(paste("ACCEPT", i, pos, j, endfrag, "x-y", j-i, abs(endfrag-pos), sep = " ")) + idx <- idx + 1 + }else{ + #print("REJECT") + } + # This will prevent overlappign lines, I think + #i <- j + + if(idx == MAXHSPS) break + } + + + + + return (HSPS) +} + +detect_events <- function(HSPS, sampling){ + + DIAG_SEPARATION <- 10 + # same as HSPS but adding the event + output <- matrix(0, nrow=length(HSPS[,1]), ncol=1+length(HSPS[1,])) + colnames(output) <- c("x1", "y1", "x2", "y2", "len", "event") + j <- 0 + for(i in 1:(length(HSPS[,1]))){ + + if(sum(HSPS[i,]) > 0){ + j <- j + 1 + is_inverted = FALSE + is_diagonal = TRUE + + if(HSPS[i,1] > HSPS[i,3]) is_inverted = TRUE + if(abs(HSPS[i,1] - HSPS[i,2]) > DIAG_SEPARATION && abs(HSPS[i,3] - HSPS[i,4]) > DIAG_SEPARATION) is_diagonal = FALSE + + output[i,1] <- HSPS[i,1] * sampling + output[i,2] <- HSPS[i,2] * sampling + output[i,3] <- HSPS[i,3] * sampling + output[i,4] <- HSPS[i,4] * sampling + output[i,5] <- HSPS[i,5] * sampling + + if(is_diagonal) output[i,6] <- "synteny block" + if(is_diagonal && is_inverted) output[i,6] <- "inversion" + if(!is_diagonal && !is_inverted) output[i,6] <- "transposition" + if(!is_diagonal && is_inverted) output[i,6] <-"inverted transposition" + } + + + } + + return (output[1:j,]) +} + + + +paint_frags <- function(HSPS, l, sampling){ + + plot(c(HSPS[1,1]*sampling, HSPS[1,3]*sampling), c(HSPS[1,2]*sampling, HSPS[1,4]*sampling), xlim = c(1,l*sampling), ylim = c(1,l*sampling), type="l", xlab="X-genome",ylab="Y-genome") + for(i in 2:length(HSPS[,1])){ + if(sum(HSPS[i,]) > 0){ + lines(c(HSPS[i,1]*sampling, HSPS[i,3]*sampling), c(HSPS[i,2]*sampling, HSPS[i,4]*sampling)) + } + } +} + +supersample <- function(mat, upscale){ + l <- min(length(mat[1,]), length(mat[,1])) + size <- round(l*upscale) + m <- matrix(0, nrow=size, ncol=size) + + for(i in 1:size){ + for(j in 1:size){ + mleft <- max(1, i-1) + mright <- min(l, i+1) + mup <- max(1, j-1) + mdown <- min(l, j+1) + + ri <- max(1, i/upscale) + rj <- max(1, j/upscale) + + if(mat[ri, rj] > 0){ + m[i, j] <- 1 + } + } + } + return (m) +} + + +downsample <- function(mat, downscale){ + l <- min(length(mat[1,]), length(mat[,1])) + size <- round(l/downscale) + m <- matrix(0, nrow=size, ncol=size) + + for(i in 1:l){ + for(j in 1:l){ + mup <- max(1, i-1) + mdown <- min(l, i+1) + mleft <- max(1, j-1) + mright <- min(l, j+1) + + #print(paste("Matrix at ", i, j)) + #print(mat[mup:mdown, mleft:mright]) + + if(sum(mat[mup:mdown, mleft:mright]) > 0){ + #print(paste("goes to ", round(i/downscale), round(j/downscale))) + m[round(i/downscale), round(j/downscale)] <- 1 + } + } + } + + return (m) +} + + + + + + + +path_mat = args[1] + + +fancy_name <- strsplit(path_mat, "/") +fancy_name <- (fancy_name[[1]])[length(fancy_name[[1]])] + +# Read sequence lenghts +con <- file(path_mat,"r") +seq_lengths <- readLines(con, n=2) +seq_x_len <- as.numeric(seq_lengths[1]) +seq_y_len <- as.numeric(seq_lengths[2]) +close(con) + + +data <- as.matrix(read.csv(path_mat, sep = " ", skip=2)) + +# Get sequence names +name_x <- strsplit(fancy_name, "-")[[1]][1] +name_y <- strsplit(fancy_name, "-")[[1]][2] + +# Max of columns + +len_i <- as.numeric(args[2]) +len_j <- as.numeric(args[2]) + + +score_density <- data +aux_density <- data + + +pmax_pos <- which.max(aux_density[,1]) +for(i in 1:len_i){ + + cmax_pos <- which.max(aux_density[i,]) # get max of row + + if((aux_density[i,cmax_pos]) > 0){ # if it has value + #row_from_col <- which.max(aux_density[,cmax_pos]) # get max of the column pointed by maximum of row + score_density[i,] <- 0 # put all others in row to 0 i.e. only use this max, EXCEPT for the maximum in the column + #score_density[row_from_col, cmax_pos] <- 1 + score_density[i,cmax_pos] <- 1 + pmax_pos <- cmax_pos + } +} + + +pmax_pos <- which.max(aux_density[1,]) + + +for(i in 1:len_i){ + + cmax_pos <- which.max(aux_density[,i]) # get max of column + + if((aux_density[cmax_pos,i]) > 0){ + score_density[,i] <- 0 + score_density[cmax_pos,i] <- 1 + pmax_pos <- cmax_pos + } +} + + + + + + + + + +score_copy <- score_density +diag_len <- 4 + +for(i in 6:(len_i-5)){ + for(j in 6:(len_j-5)){ + + value <- 0 + for(w in (-diag_len/2):(diag_len/2)){ + if(score_density[i+w,j+w] > 0){ + value <- value + 1 + } + } + + if(value >= diag_len){ + for(k in 1:5){ + score_copy[i+k,j+k] <- 1 + } + for(k in 1:5){ + score_copy[i-k,j-k] <- 1 + } + }else if(score_copy[i,j]==0){ + score_copy[i,j] <- 0 + } + } +} + +for(i in 6:(len_i-5)){ + for(j in 6:(len_j-5)){ + + value <- 0 + for(w in (-diag_len/2):(diag_len/2)){ + if(score_density[i-w,j+w] > 0){ + value <- value + 1 + } + } + + if(value >= diag_len){ + for(k in 1:5){ + score_copy[i-k,j+k] <- 1 + } + for(k in 1:5){ + score_copy[i+k,j-k] <- 1 + } + }else if(score_copy[i,j]==0){ + score_copy[i,j] <- 0 + } + } +} + + +# Kernel to remove single points + +for(i in 1:(length(score_copy[,1]))){ + for(j in 1:(length(score_copy[1,]))){ + + value <- 0 + + min_i <- max(1, i-1) + max_i <- min(len_i, i+1) + min_j <- max(1, j-1) + max_j <- min(len_j, j+1) + + value <- sum(score_copy[min_i:max_i, min_j:max_j]) + + if(value < 2) score_copy[i,j] <- 0 + } +} + +# To compute the score +score <- 0 +pmax_pos <- which.max(score_copy[,1]) +dist_th <- 1.5 +besti <- 1 +bestj <- pmax_pos +dvec1 <- abs(which.max(score_copy[,2]) - which.max(score_copy[,1])) +dvec2 <- abs(which.max(score_copy[,3]) - which.max(score_copy[,2])) +dvec3 <- abs(which.max(score_copy[,4]) - which.max(score_copy[,3])) +for(i in 5:len_i){ + + #print(paste(paste(paste(dvec1, dvec2), dvec3), mean(c(dvec1, dvec2, dvec3)))) + distance <- mean(c(dvec1, dvec2, dvec3)) + + + + dvec1 <- dvec2 + dvec2 <- dvec3 + dvec3 <- abs(which.max(score_copy[,i]) - which.max(score_copy[,i-1])) + + # If there is a 0 or we are too far away just add max distance! + if(distance > dist_th || distance == 0){ + score <- score + len_i + } + +} + + +score <- (score/(len_i^2)) +print(score) + + +sampling_value <- 5 +submat <- downsample(score_copy, sampling_value) +m <- growing_regions((submat), WSIZE = 7, TH = 5, penalty = 15) +events <- detect_events(m, sampling_value) +events <- rbind(events, c(0,0,0,0,0,"Null event")) +write(as.character(c(seq_x_len, seq_y_len)), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=2) +write(as.character(c("x1", "y1", "x2", "y2", "len", "event")), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=6) +for(i in 1:length(events[,1])){ + write(as.character(events[i,]), file=paste(path_mat,".events.txt", sep=""), append = TRUE, sep =",", ncolumns=6) +} + + + +coords1 <- round(seq(from=0, to=1, by=0.2)*seq_x_len) +coords2 <- round(seq(from=0, to=1, by=0.2)*seq_y_len) + +final_image <- apply((t(score_copy)), 2, rev) + +png(paste(path_mat, ".filt.png", sep=""), width = length(data[,1]), height = length(data[,1])) +image(t(final_image), col = grey(seq(1, 0, length = 256)), xaxt='n', yaxt='n', main = paste(fancy_name, paste("filt. score=", score)), xlab = name_x, ylab = name_y, axes = FALSE) +axis(1, tick = TRUE, labels = (coords1), at = c(0.0, 0.2, 0.4, 0.6, 0.8, 1)) +axis(2, tick = TRUE, labels = rev(coords2), at = c(0.0, 0.2, 0.4, 0.6, 0.8, 1)) +dev.off() + + +# Output pixel coordinates of highly conserved signals + +# To clear it in case it existed +write(c(paste("X", "Y")), file = paste("hits-XY-", paste(fancy_name, ".hits", sep=""), sep=""), append = FALSE, sep = "\n") + + +for(i in 1:(length(score_copy[,1]))){ + for(j in 1:(length(score_copy[1,]))){ + if(score_copy[i,j] != 0){ + write(c(paste(i, j)), file = paste("hits-XY-", paste(fancy_name, ".hits", sep=""), sep=""), append = TRUE, sep = "\n") + } + } +} diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/generate-one-score.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/generate-one-score.sh Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,181 @@ +#!/usr/bin/env bash +CSV=$1 +TH=$2 + +if [ $# -ne 2 ]; then + echo " ==== ERROR ... you called this script inappropriately." + echo "" + echo " usage: $0 " + echo "" + exit -1 +fi + + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# get first genome in list (they are sorted) +currgenome=$(tail -n +2 "$CSV" | head -1 | awk -F "," '{print $6}') + +# fill array of chromosomes similarity + +array=() +arraytosort=() +names=() +homologies=() +condition=0 +othergencounter=0 +# for problems with chromo X and Y +highest=1 +# For all lines + +cat $CSV | tail -n +2 > $1.temp + +while IFS= read -r i +do + + othergenome=$(echo "$i" | awk -F "," '{print $6}') + if [ "$condition" -eq 0 ]; then + currgenome=$othergenome + condition=1 + fi + + if [ "$othergenome" != "$currgenome" ]; then + + # Sort the array with temporal values + #printf '%s\n' "${arraytosort[@]}" + #echo "name is $currgenome" + + sorted=($(printf '%s\n' "${arraytosort[@]}"|sort)) + + #echo "For chroomo $currgenome we have " + #echo $(printf '%s,' "${sorted[@]}") + # accumulate sum until threshold is reached + usedValues=1 + usedValuesNext=2 + first=${sorted[0]} + next=${sorted[${usedValues}]} + nextofnext=${sorted[${usedValuesNext}]} + finalvalue=$first + divisor=0 + currdiff=$(LC_NUMERIC=POSIX awk -v a="$next" -v b="$nextofnext" 'BEGIN {print b-a }') + TH=$(printf '%4.6f' $TH) + #echo "$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("comp %f > %f = %d",a,b,a>b)} ')" + condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d",a>b)} ') + #echo "first $first next $next result $currdiff condition $condition divisor $divisor th $TH finalvalue $finalvalue" + while [ $condition -eq 1 -a $usedValuesNext -lt ${#sorted[@]} ]; + do + usedValues=`expr $usedValues + 1` + usedValuesNext=`expr $usedValuesNext + 1` + finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$next" 'BEGIN {print (a+b)}') + next=${sorted[${usedValues}]} + nextofnext=${sorted[${usedValuesNext}]} + + currdiff=$(LC_NUMERIC=POSIX awk -v a="$nextofnext" -v b="$next" 'BEGIN {printf("%f", b-a) }') + condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d", a>b)} ') + divisor=$(LC_NUMERIC=POSIX awk -v a="$divisor" 'BEGIN {print a+0.1}') + + done + + #echo "so this is what we got $finalvalue, when divided using $divisor" + + # array holds the results + #array[$highest]=$(awk -v a="$currsum" -v b="$othergencounter" 'BEGIN {print a/b}') + finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$usedValues" -v c="$divisor" 'BEGIN {printf("%f", a/(b-c))}') + array[$highest]=$finalvalue + homologies[$highest]=$usedValues + + highest=`expr $highest + 1` + condition=0 + names+=($currgenome) + othergencounter=0 + unset arraytosort + + + + + getvalue=$(echo "$i" | awk -F "," '{print $8}') + # Copy value to array + arraytosort[$othergencounter]=$getvalue + #currsum=$(awk -v a="$currsum" -v b="$getvalue" 'BEGIN {print a=a+(1-b); exit}') + othergencounter=`expr $othergencounter + 1` + else + getvalue=$(echo "$i" | awk -F "," '{print $8}') + # Copy value to array + arraytosort[$othergencounter]=$getvalue + #currsum=$(awk -v a="$currsum" -v b="$getvalue" 'BEGIN {print a=a+(1-b); exit}') + othergencounter=`expr $othergencounter + 1` + + fi + +done < "$1.temp" + +# do the last!!! +#if [ "$lastprint" == "$currgenome" ]; then +# Sort the array with temporal values +sorted=($(printf '%s\n' "${arraytosort[@]}"|sort)) + +usedValues=1 +usedValuesNext=2 +first=${sorted[0]} +next=${sorted[${usedValues}]} +nextofnext=${sorted[${usedValuesNext}]} +finalvalue=$first +divisor=0 +currdiff=$(LC_NUMERIC=POSIX awk -v a="$next" -v b="$nextofnext" 'BEGIN {print b-a }') +TH=$(printf '%4.6f' $TH) +#echo "$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("comp %f > %f = %d",a,b,a>b)} ')" +condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d",a>b)} ') +#echo "first $first next $next result $currdiff condition $condition divisor $divisor th $TH finalvalue $finalvalue" +while [ $condition -eq 1 -a $usedValuesNext -lt ${#sorted[@]} ]; +do + usedValues=`expr $usedValues + 1` + usedValuesNext=`expr $usedValuesNext + 1` + finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$next" 'BEGIN {print (a+b)}') + next=${sorted[${usedValues}]} + nextofnext=${sorted[${usedValuesNext}]} + + currdiff=$(LC_NUMERIC=POSIX awk -v a="$nextofnext" -v b="$next" 'BEGIN {printf("%f", b-a) }') + condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d", a>b)} ') + divisor=$(LC_NUMERIC=POSIX awk -v a="$divisor" 'BEGIN {print a+0.1}') + +done + +#echo "so this is what we got $finalvalue, when divided using $divisor" + +# array holds the results +#array[$highest]=$(awk -v a="$currsum" -v b="$othergencounter" 'BEGIN {print a/b}') +finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$usedValues" -v c="$divisor" 'BEGIN {printf("%f", a/(b-c))}') +array[$highest]=$finalvalue +homologies[$highest]=$usedValues + + +highest=`expr $highest + 1` +currsum=0 +names+=($currgenome) +currgenome=$othergenome +othergencounter=0 +#fi + + +highest=`expr $highest - 1` +rm $1.temp + +tsum=0 +echo "deleteme" > $1.inter +rm $1.inter +aux=0 +for ((i = 1; i <= highest; i++)); do + echo "${names[${aux}]} ${array[${i}]} ${homologies[${i}]}" >> $1.inter + #echo "${names[${aux}]} ${array[${i}]} ${homologies[${i}]}" + aux=`expr $aux + 1` + #val=${array[${i}]} + #tsum=$(awk -v a="$tsum" -v b="$val" '{print a=a+b}') +done + +#awk -v a="$tsum" b="$highest" '{print a/b}' + +#sumfirst=$(awk -F "," 'BEGIN{suma=0}{suma = suma + $8}END{print suma}' "$CSV") +#echo "$sumfirst" + + + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/index_chromeister.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/index_chromeister.sh Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +DIR=$1 +FASTAS1=$3 +FASTAS2=$4 +OUT=$2 + +echo "Computing index CSV..." > index.csv.temp + +while [ "$(find . -size 0 | wc -l)" -ne 0 ]; do + sleep 10s +done + + +EXT="mat" +EXTSCORE="scr.txt" +EXTGENERAL=".fa.fasta" + + + +if [ $# != 4 ]; then + echo "***ERROR*** Use: $0 " + exit -1 +fi + +rm $DIR/*.log + +for i in $DIR/*.scr; do mv $i $i.txt; done + +rm $OUT + +for elem in $(ls -d $DIR/*.$EXT | awk -F "/" '{print $NF}' | awk -F ".$EXT" '{print $1}') +do + IFS='-', read -a splits <<< "$elem" + IFS='.', read -a getnum <<< "$elem" + + scorepath=$(basename $elem .mat).$EXTSCORE + + sed -i "/X.*/d" $DIR/$scorepath + sed -i "s/\[1\]//g" $DIR/$scorepath + + score=$( head -1 $DIR/$scorepath) + + file1=${splits[0]} + file2=$(basename ${splits[1]} .mat) + + ID1=$(head -1 $FASTAS1/$file1) + ID2=$(head -1 $FASTAS2/$file2) + + + #scorepath="$(basename $elem .mat).$EXTSCORE" + #score="$(head -1 $DIR/$scorepath)" + + counter=0 + numX=0 + numY=0 + for i in "${getnum[@]}" + do + counter=`expr $counter + 1` + if [ "$numX" -eq 0 ] && [ "$i" == "chromosome" ]; then + numX=$counter + continue + fi + if [ "$numX" -ne 0 ] && [ "$i" == "chromosome" ]; then + numY=$counter + fi + + done + + + echo "$(basename ${splits[0]} $EXTGENERAL),$(basename ${splits[1]} ${EXTGENERAL}.mat),$ID1,$ID2,$elem.$EXT.filt.png,${getnum[${numX}]},${getnum[${numY}]},$score" >> $OUT + +done + +sort -k5,5n -k6,6n -o $OUT $OUT + + +sed -i '1iSpX, SpY, IDX, IDY, IMG, CHNumberX, CHNumberY, Score' $OUT + +rm index.csv.temp + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/index_chromeister_solo.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/index_chromeister_solo.sh Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +DIR=$1 +FASTAS1=$3 +FASTAS2=$4 +OUT=$2 + +echo "Computing index CSV..." > index.csv.temp + +while [ "$(find . -size 0 | wc -l)" -ne 0 ]; do + sleep 10s +done + + +EXT="mat" +EXTSCORE="scr.txt" +EXTGENERAL=".fasta" + +# data is like +# HOMSA.Chr.10.fasta + + + +if [ $# != 4 ]; then + echo "***ERROR*** Use: $0 " + exit -1 +fi + + +rm $OUT + +for elem in $(ls -d $DIR/*.$EXT | awk -F "/" '{print $NF}' | awk -F ".$EXT" '{print $1}') +do + + IFS='-', read -a splits <<< "$elem" # yields MUSMU.Chr.8.fasta and MUSMU.Chr.Y.fasta + IFS='.', read -a getnum <<< "$elem" # yields MUSMU Chr 8 fasta and MUSMU Chr Y fasta + + scorepath=$(basename $elem .mat).$EXTSCORE + + sed -i "/X.*/d" $DIR/$scorepath + sed -i "s/\[1\]//g" $DIR/$scorepath + + score=$( head -1 $DIR/$scorepath) + len1=$( head -1 $DIR/$elem.mat.events.txt | awk -F="," '{print $1}') + len2=$( head -1 $DIR/$elem.mat.events.txt | awk -F="," '{print $2}') + + + + file1=${splits[0]} + file2=$(basename ${splits[1]} .mat) + + ID1=$(head -1 $FASTAS1/$file1) + ID2=$(head -1 $FASTAS2/$file2) + + + #scorepath="$(basename $elem .mat).$EXTSCORE" + #score="$(head -1 $DIR/$scorepath)" + + counter=0 + numX=${getnum[2]} + numY=${getnum[5]} + + + + + echo "$(basename ${splits[0]} $EXTGENERAL),$(basename ${splits[1]} ${EXTGENERAL}),$ID1,$ID2,$elem.$EXT.filt.png,$numX,$numY,$score,$len1 $len2" >> $OUT + +done + +sort -k5,5n -k6,6n -o $OUT $OUT + + +sed -i '1iSpX, SpY, IDX, IDY, IMG, CHNumberX, CHNumberY, Score, LengthX, LengthY' $OUT + +rm index.csv.temp + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/make-cluster.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/make-cluster.R Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,54 @@ +#!/usr/bin/env Rscript +library(ape) + +args = commandArgs(trailingOnly=TRUE) + +if(length(args) < 1){ + stop("USE: Rscript --vanilla make-cluster.R ") +} + + +path = args[1] + +inputcsv <- read.csv(path, sep ="-", header=FALSE) + +# Make indexing table + +indexing_table <- table(inputcsv[,1]) +n_species <- length(indexing_table) + +for(i in 1:n_species){ + indexing_table[i] <- i +} + + +distmat <- matrix(NA, nrow=n_species, ncol=n_species) +rownames(distmat) <- rownames(indexing_table) +colnames(distmat) <- rownames(indexing_table) + +for(i in 1:n_species){ + distmat[i,i] <- 0 +} + + +for(i in 1:length(inputcsv[,1])){ + redirX <- (as.character(inputcsv[i,1])) + redirY <- (as.character(inputcsv[i,2])) + + distmat[indexing_table[redirX],indexing_table[redirY]] <- as.numeric(inputcsv[i,3]) + distmat[indexing_table[redirY],indexing_table[redirX]] <- as.numeric(inputcsv[i,3]) +} + +cluster <- hclust(dist(distmat), method = "average") + +# Rooted hierarchical cluster +#plot(cluster, main = "Clustering based on automatic scoring", xlab = "Organisms") + +# Unrooted +plot(as.phylo(cluster), type = "unrooted", cex = 0.6, main = "Unrooted clustering based on automatic") + +# Fan cluster +#plot(as.phylo(cluster), type = "fan") + + + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/make-mean.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/make-mean.sh Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Use in folder prior to all folders! + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +for i in *; do + + + $BINDIR/generate-one-score.sh $i/index-$i.csv 0.2 + value=$(awk '{sum+=$2} END { print sum/NR}' $i/index-$i.csv.inter) + echo "$i-$value" + + +done + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/plot.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/plot.R Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,17 @@ +#!/usr/bin/env Rscript +args = commandArgs(trailingOnly=TRUE) + +if(length(args) < 1){ + stop("USE: Rscript --vanilla plot.R ") +} + + +path_mat = args[1] + + +data <- read.csv(path_mat, header = FALSE, sep = " ", skip=1) + + +png(paste(path_mat, ".png", sep=""), width = length(data[,1]), height = length(data[,1])) +image(t(as.matrix(data)), col = grey(seq(1, 0, length = 256)), xaxt='n', yaxt='n') +dev.off() diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/plot_diags.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/plot_diags.R Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,179 @@ +#!/usr/bin/env Rscript +args = commandArgs(trailingOnly=TRUE) + +if(length(args) < 1){ + stop("USE: Rscript --vanilla plot.R ") +} + + +path_mat = args[1] + + + + +data <- as.matrix(read.csv(path_mat, sep = " ")) +percentage <- 0.1 + + + + +d <- col(data) - row(data) +groups <- split(data, d) +acum <- c() +for(i in 1:length(groups)){ + acum <- c(acum, sum(unlist(groups[i]))) +} + + +# reverse +data_r <- data +for(i in 1:length(data_r[,1])){ + data_r[i,] <- rev(data_r[i,]) +} +d <- col(data_r) - row(data_r) +groups <- split(data_r, d) +acum_r <- c() +for(i in 1:length(groups)){ + acum_r <- c(acum, sum(unlist(groups[i]))/length(unlist(groups[i])) ) +} + +indexes <- matrix(0, nrow=length(acum), ncol = 3) +indexes[,1] <- c(1:length(acum)) +indexes[,2] <- acum + +#sort +indexes[order(indexes[,2], decreasing=TRUE),] + +#crop by percentage +n_percent <- percentage * length(acum) + +for(i in 1:n_percent){ + indexes[i,3] <- 1 +} + +# Now resort based on first column to have them again in order +indexes[order(indexes[,1], decreasing=FALSE),] + + +# same in reverse +#put into indexes to sort +indexes_r <- matrix(0, nrow=length(acum_r), ncol = 3) +indexes_r[,1] <- c(1:length(acum_r)) +indexes_r[,2] <- acum_r + +#sort +indexes_r[order(indexes_r[,2], decreasing=TRUE),] + +#crop by percentage +n_percent <- percentage * length(acum_r) + +for(i in 1:n_percent){ + indexes_r[i,3] <- 1 +} + +# Now resort based on first column to have them again in order +indexes_r[order(indexes_r[,1], decreasing=FALSE),] + + + + +finalmat <- matrix(0, nrow=length(data[,1]), ncol=length(data[1,])) + + +for(i in length(data[1,]):1){ + x <- i + y <- 1 + + if(indexes[i,3] == 1){ + while(x < length(data[1,]) && y <= length(data[,1])){ + + + if(x < length(data[1,]) && y <= length(data[,1])){ + finalmat[x,y] <- 1 + } + + y <- y + 1 + x <- x + 1 + + + } + } + +} + +for(i in 2:length(data[,1])){ + x <- 1 + y <- i + + if(indexes[i,3] == 1){ + + while(x < length(data[1,]) && y <= length(data[,1])){ + + if(x < length(data[1,]) && y <= length(data[,1])){ + finalmat[x,y] <- 1 + } + y <- y + 1 + x <- x + 1 + + } + } +} + + +#same for reverse +for(i in length(data_r[1,]):1){ + x <- i + y <- 1 + + if(indexes[i,3] == 1){ + while(x < length(data_r[1,]) && y <= length(data_r[,1])){ + + + if(x < length(data_r[1,]) && y <= length(data_r[,1])){ + finalmat[x,length(data_r[,1])-y] <- 1 + } + + y <- y + 1 + x <- x + 1 + + + } + } + +} + +for(i in 2:length(data_r[,1])){ + x <- 1 + y <- i + + if(indexes[i,3] == 1){ + + while(x < length(data_r[1,]) && y <= length(data_r[,1])){ + + if(x < length(data_r[1,]) && y <= length(data_r[,1])){ + finalmat[x,length(data_r[,1])-y] <- 1 + } + y <- y + 1 + x <- x + 1 + + } + } +} + +# fully write last + +for(i in 1:length(data[,1])){ + for(j in 1:length(data[1,])){ + if(finalmat[i,j] == 0){ + + data[i,j] <- 0 + } + } +} + +png(paste(path_mat, ".png", sep=""), width = length(data[,1]), height = length(data[,1])) +image((data), col = grey(seq(1, 0, length = 256)), xaxt='n', yaxt='n') +dev.off() + + + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/recompute_scores.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/recompute_scores.sh Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +if [ $# -ne 1 ]; then + echo " ==== ERROR ... you called this script inappropriately." + echo "" + echo " usage: $0 <#threads>" + echo "" + exit -1 +fi + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +THREADS=$1 + + +for p in */; +do + + echo "Entering $p" + cd $p + + + + thepaths=() + n=0 + # Grab the routes + for i in *.mat ; + do + + thepaths[$n]=$i + n=`expr $n + 1` + + done + + i=0 + aux=0 + for ((i=0; i < $n ; i+=$THREADS)) + do + aux=$i + for ((j=0; j<$THREADS; j++)) + do + if [ "$aux" -lt "$n" ]; then + echo "Recomputing ${thepaths[$aux]}" + goodpath=${thepaths[$aux]%.mat} + Rscript $BINDIR/compute_score.R ${thepaths[$aux]} > ${goodpath}.scr.txt & + aux=`expr $aux + 1` + fi + done + + for job in `jobs -p` + do + #echo $job + wait $job + done + + done + + + echo "Exiting $p" + cd .. + + +done + + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/bin/run_and_plot_chromeister.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/bin/run_and_plot_chromeister.sh Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +G1=$1 +G2=$2 +KMER=$3 +DIM=$4 + + +FILE1=$(basename $G1) +FILE2=$(basename $G2) + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + + +if [ $# -lt 4 ]; then + echo "***ERROR*** Use: $0 " + exit -1 +fi + + + +(time $BINDIR/CHROMEISTER -query $G1 -db $G2 -kmer $KMER -out $FILE1-$FILE2.mat -dimension $DIM) &> $FILE1-$FILE2.log +(Rscript $BINDIR/compute_score.R $FILE1-$FILE2.mat $DIM) &> $FILE1-$FILE2.scr.txt + +#$BINDIR/plot.R $FILE1-$FILE2.mat + + +#rm $FILE1-$FILE2.mat diff -r 000000000000 -r 7fdf47a0bae8 chromeister/chromeister.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/chromeister.xml Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,14 @@ + + A heuristic approach for ultra fast previsualization of pairwise genome comparisons + + + + + + echo "\$PWD"; cp $query ${query.name}; cp $db ${db.name}; (/home/galaxy-bitlab/galaxy/tools/chromeister/bin/CHROMEISTER -query ${query.name} -db ${db.name} -dimension $dimension -out ${query.name}-${db.name}.mat) &>/dev/null ; rm ${query.name}; rm ${db.name}; Rscript /home/galaxy-bitlab/galaxy/tools/chromeister/bin/compute_score.R ${query.name}-${db.name}.mat $dimension; mv ${query.name}-${db.name}.mat $output; mv ${query.name}-${db.name}.mat.filt.png $outputIMAGEN ; mv ${query.name}-${db.name}.mat.events.txt $outputEVENTS; rm hits-XY-${query.name}-${db.name}.mat.hits + + + + + + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/src/CHROMEISTER.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/src/CHROMEISTER.c Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,651 @@ +/********* + +File CHROMEISTER.c +Author EPW +Description Computes hits and generates a dotplot + +USAGE Usage is described by calling ./CHROMEISTER --help + + + +**********/ + + +#include +#include +#include +#include +#include +#include "structs.h" +#include "alignmentFunctions.h" +#include "commonFunctions.h" +#define STARTING_SEQS 1000 +#define PIECE_OF_DB_REALLOC 3200000 //half a gigabyte if divided by 8 bytes +#define RANGE 2 + +uint64_t custom_kmer = 32; // Defined as external in structs.h +uint64_t diffuse_z = 4; // Defined as external in structs.h + +uint64_t get_seq_len(FILE * f); + + +void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * custom_kmer, uint64_t * dimension, uint64_t * diffuse_z); + +int main(int argc, char ** av){ + + + /* + //Store positions of kmers + uint64_t n_pools_used = 0; + //Mempool_l * mp = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l)); + //if(mp == NULL) terror("Could not allocate vectors for memory pools"); + Mempool_l mp[MAX_MEM_POOLS]; + init_mem_pool_llpos(&mp[n_pools_used]); + //llpos * aux; + + uint64_t n_pools_used_AVL = 0; + Mempool_AVL mp_AVL[MAX_MEM_POOLS]; + init_mem_pool_AVL(&mp_AVL[n_pools_used_AVL]); + */ + Tuple_hits * thit; + + /* + AVLTree * root = NULL; + root = insert_AVLTree(root, 10, mp_AVL, &n_pools_used_AVL, 0, mp, &n_pools_used); + + llpos * some = find_AVLTree(root, 25); + while(some != NULL){ + printf("#%"PRIu64", ", some->pos); some = some->next; + } + */ + + uint64_t i, j; + + //query to read kmers from, database to find seeds + FILE * query = NULL, * database = NULL, * out_database = NULL; + uint64_t dimension = 1000; // Default 1000 * 1000 + + + init_args(argc, av, &query, &database, &out_database, &custom_kmer, &dimension, &diffuse_z); + + + + unsigned char char_converter[91]; + char_converter[(unsigned char)'A'] = 0; + char_converter[(unsigned char)'C'] = 1; + char_converter[(unsigned char)'G'] = 2; + char_converter[(unsigned char)'T'] = 3; + + + // Variables to account for positions + // Print info + fprintf(stdout, "[INFO] Loading database\n"); + // Variables to read kmers + char c = 'N'; //Char to read character + // Current length of array and variables for the buffer + uint64_t idx = 0, r = 0; + + //Vector to read in batches + char * temp_seq_buffer = NULL; + if ((temp_seq_buffer = calloc(READBUF, sizeof(char))) == NULL) { + terror("Could not allocate memory for read buffer"); + } + + //Dimensional matrix + uint64_t ** representation = (uint64_t **) calloc(dimension+1, sizeof(uint64_t *)); + if(representation == NULL) terror("Could not allocate representation"); + for(i=0; i'){ + + + + while(c != '\n') c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); //Skip ID + + + while(c != '>' && (!feof(database) || (feof(database) && idx < r))){ //Until next id + c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); + c = toupper(c); + if(c == 'A' || c == 'C' || c == 'G' || c == 'T'){ + curr_kmer[word_size] = (unsigned char) c; + if(word_size < custom_kmer) ++word_size; + ++current_len; + if(current_len % a_hundreth == 0){ + fprintf(stdout, "\r%"PRIu64"%%...", 1+100*current_len/aprox_len_query); + //printf("%"PRIu64"%%..wasted: (%e) (%e)", 1+100*pos_in_query/aprox_len_query, (double)(wasted_cycles_forward)/CLOCKS_PER_SEC, (double)(wasted_cycles_reverse)/CLOCKS_PER_SEC); + fflush(stdout); + } + + + + }else{ //It can be anything (including N, Y, X ...) + + if(c != '\n' && c != '>'){ + word_size = 0; + // data_database.sequences[pos_in_database++] = (unsigned char) 'N'; //Convert to N + ++current_len; + + } + } + //if(current_len % 1000000 == 0) printf(" curr len %" PRIu64"\n", current_len); + if(word_size == custom_kmer){ + //write to hash table + + + thit = &ctidx->table[char_converter[curr_kmer[0]]][char_converter[curr_kmer[1]]][char_converter[curr_kmer[2]]] + [char_converter[curr_kmer[3]]][char_converter[curr_kmer[4]]][char_converter[curr_kmer[5]]] + [char_converter[curr_kmer[6]]][char_converter[curr_kmer[7]]][char_converter[curr_kmer[8]]] + [char_converter[curr_kmer[9]]][char_converter[curr_kmer[10]]][char_converter[curr_kmer[11]]]; + + /* + typedef struct tuple_hits{ + int repetition; + int hit_count; + uint64_t key; + uint64_t pos; + } Tuple_hits; + */ + + if(thit->repetition == FALSE){ + // Then we can insert + thit->hit_count = 0; + thit->key = collisioned_hash(&curr_kmer[0], custom_kmer); + thit->pos = current_len; + }else{ + // Otherwise we break it + thit->repetition = TRUE; + } + + //thit->root = insert_AVLTree(thit->root, hashOfWord(&curr_kmer[0], custom_kmer, FIXED_K), mp_AVL, &n_pools_used_AVL, current_len, mp, &n_pools_used); + //thit->root = insert_AVLTree(thit->root, collisioned_hash(&curr_kmer[0], custom_kmer), mp_AVL, &n_pools_used_AVL, current_len, mp, &n_pools_used); + + + + // Non overlapping + word_size = 0; + + + // Overlapping + //memmove(&curr_kmer[0], &curr_kmer[1], custom_kmer-1); + //--word_size; + } + } + word_size = 0; + + }else{ + c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); + } + + } + + + //end = clock(); + + // data_database.total_len = pos_in_database; + + //fprintf(stdout, "[INFO] Database loaded and of length %"PRIu64". Hash table building took %e seconds\n", data_database.total_len, (double)(end-begin)/CLOCKS_PER_SEC); + fprintf(stdout, "[INFO] Database loaded and of length %"PRIu64".\n", current_len); + //close database + fclose(database); + + + + //begin = clock(); + + + + + + double pixel_size_db = (double) dimension / (double) current_len; + double ratio_db = (double) current_len / dimension; + + + // Get file length + + //fseek(query, 0, SEEK_END); + //aprox_len_query = ftell(query); + //rewind(query); + aprox_len_query = get_seq_len(query); + + //uint64_t reallocs_hash_holder_table = 1; + //uint64_t n_items_hash_holder_table = aprox_len_query / 5; + + //Hash_holder * hash_holder_table = (Hash_holder *) calloc(n_items_hash_holder_table, sizeof(Hash_holder)); + //if(hash_holder_table == NULL) terror("Could not allocate hash holding table"); + + a_hundreth = (aprox_len_query/100); + double pixel_size_query = (double) dimension / (double) aprox_len_query; + double ratio_query = (double) aprox_len_query / dimension; + + + double i_r_fix = MAX(1.0, custom_kmer * pixel_size_query); + double j_r_fix = MAX(1.0, custom_kmer * pixel_size_db); + + + + fprintf(stdout, "[INFO] Ratios: Q [%e] D [%e]. Lenghts: Q [%"PRIu64"] D [%"PRIu64"]\n", ratio_query, ratio_db, aprox_len_query, current_len); + fprintf(stdout, "[INFO] Pixel size: Q [%e] D [%e].\n", pixel_size_query, pixel_size_db); + + + fprintf(stdout, "[INFO] Computing absolute hit numbers.\n"); + + + current_len = 0; + + //llpos * the_original_hit; + + //To force reading from the buffer + idx = READBUF + 1; + c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); + //uint64_t c_hash_holder = 0; + + while((!feof(query) || (feof(query) && idx < r))){ + + if(c == '>'){ + word_size = 0; + word_size_rev = custom_kmer-1; + + + + + while(c != '\n'){ c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); } //Skip ID + + + while(c != '>' && (!feof(query) || (feof(query) && idx < r))){ //Until next id + c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); + c = toupper(c); + if(c == 'A' || c == 'C' || c == 'G' || c == 'T'){ + + ++current_len; + if(current_len % a_hundreth == 0){ + fprintf(stdout, "\r%"PRIu64"%%...", 1+100*current_len/aprox_len_query); + fflush(stdout); + } + curr_kmer[word_size] = (unsigned char) c; + ++word_size; + + switch(c){ + case ('A'): reverse_kmer[word_size_rev] = (unsigned)'T'; + break; + case ('C'): reverse_kmer[word_size_rev] = (unsigned)'G'; + break; + case ('G'): reverse_kmer[word_size_rev] = (unsigned)'C'; + break; + case ('T'): reverse_kmer[word_size_rev] = (unsigned)'A'; + break; + } + if(word_size_rev != 0) --word_size_rev; + + + + + if(word_size == custom_kmer){ + + + //hash_forward = hashOfWord(&curr_kmer[0], custom_kmer, FIXED_K); + //hash_reverse = hashOfWord(&reverse_kmer[0], custom_kmer, FIXED_K); + uint64_t hash_forward, hash_reverse; + hash_forward = collisioned_hash(&curr_kmer[0], custom_kmer); + hash_reverse = collisioned_hash(&reverse_kmer[0], custom_kmer); + + + thit = &ctidx->table[char_converter[curr_kmer[0]]][char_converter[curr_kmer[1]]][char_converter[curr_kmer[2]]] + [char_converter[curr_kmer[3]]][char_converter[curr_kmer[4]]][char_converter[curr_kmer[5]]] + [char_converter[curr_kmer[6]]][char_converter[curr_kmer[7]]][char_converter[curr_kmer[8]]] + [char_converter[curr_kmer[9]]][char_converter[curr_kmer[10]]][char_converter[curr_kmer[11]]]; + + //AVLTree * search = find_AVLTree(thit->root, hash_forward); + + if(thit->repetition == FALSE && hash_forward == thit->key){ + // Attention ::::: you were not removing the ones with count==1 earlier + thit->pos_in_y = current_len; + thit->hit_count++; + } + + thit = &ctidx->table[char_converter[reverse_kmer[0]]][char_converter[reverse_kmer[1]]][char_converter[reverse_kmer[2]]] + [char_converter[reverse_kmer[3]]][char_converter[reverse_kmer[4]]][char_converter[reverse_kmer[5]]] + [char_converter[reverse_kmer[6]]][char_converter[reverse_kmer[7]]][char_converter[reverse_kmer[8]]] + [char_converter[reverse_kmer[9]]][char_converter[reverse_kmer[10]]][char_converter[reverse_kmer[11]]]; + + if(thit->repetition == FALSE && hash_reverse == thit->key){ + // Attention ::::: you were not removing the ones with count==1 earlier + thit->pos_in_y = current_len; + thit->hit_count++; + } + + /* + if(search != NULL && search->count == 1){ //If count is two, then it is a rep + thit->hit_count += search->count; + + hash_holder_table[c_hash_holder].pos = current_len; + hash_holder_table[c_hash_holder].node = search; + hash_holder_table[c_hash_holder].th = thit; + ++c_hash_holder; + if(c_hash_holder == n_items_hash_holder_table*reallocs_hash_holder_table){ + ++reallocs_hash_holder_table; + hash_holder_table = (Hash_holder *) realloc(hash_holder_table, n_items_hash_holder_table*reallocs_hash_holder_table*sizeof(Hash_holder)); + if(hash_holder_table == NULL) terror("Could not realloc hash holder table"); + } + } + */ + + + + + + //search = find_AVLTree(thit->root, hash_reverse); + /* + if(search != NULL && search->count == 1){ //If count is two, then it is a rep + + thit->hit_count += search->count; + hash_holder_table[c_hash_holder].pos = current_len; + hash_holder_table[c_hash_holder].node = search; + hash_holder_table[c_hash_holder].th = thit; + ++c_hash_holder; + if(c_hash_holder == n_items_hash_holder_table*reallocs_hash_holder_table){ + ++reallocs_hash_holder_table; + hash_holder_table = (Hash_holder *) realloc(hash_holder_table, n_items_hash_holder_table*reallocs_hash_holder_table*sizeof(Hash_holder)); + if(hash_holder_table == NULL) terror("Could not realloc hash holder table"); + } + } + */ + + // Overlapping + + memmove(&curr_kmer[0], &curr_kmer[1], custom_kmer-1); + memmove(&reverse_kmer[1], &reverse_kmer[0], custom_kmer-1); + --word_size; + + // Non overlapping + //word_size = 0; + //word_size_rev = custom_kmer-1; + } + }else{ + if(c != '\n' && c != '>'){ + word_size = 0; + word_size_rev = custom_kmer-1; + ++current_len; + } + } + } + }else{ + c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); + } + + } + + /// Out + + fprintf(stdout, "Scanning hits table.\n"); + + a_hundreth = MAX(1, TOTAL_ENTRIES/100); + uint64_t t_computed = 0; + uint64_t w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11; + for(w0=0;w0<4;w0++){ + for(w1=0;w1<4;w1++){ + for(w2=0;w2<4;w2++){ + for(w3=0;w3<4;w3++){ + for(w4=0;w4<4;w4++){ + for(w5=0;w5<4;w5++){ + for(w6=0;w6<4;w6++){ + for(w7=0;w7<4;w7++){ + for(w8=0;w8<4;w8++){ + for(w9=0;w9<4;w9++){ + for(w10=0;w10<4;w10++){ + for(w11=0;w11<4;w11++){ + + if(t_computed % a_hundreth == 0){ + fprintf(stdout, "\r%"PRIu64"%%...", 1+100*t_computed/TOTAL_ENTRIES); + fflush(stdout); + } + ++t_computed; + Tuple_hits * taux = &ctidx->table[w0][w1][w2][w3][w4][w5][w6][w7][w8][w9][w10][w11]; + if(taux->hit_count == 1){ + // We plot it + // Convert scale to representation + uint64_t redir_db = (uint64_t) (taux->pos / (ratio_db)); + uint64_t redir_query = (uint64_t) (taux->pos_in_y / (ratio_query)); + double i_r = i_r_fix; double j_r = j_r_fix; + while((uint64_t) i_r >= 1 && (uint64_t) j_r >= 1){ + if((int64_t) redir_query - (int64_t) i_r > 0 && (int64_t) redir_db - (int64_t) j_r > 0){ + representation[(int64_t) redir_query - (int64_t) i_r][(int64_t) redir_db - (int64_t) j_r]++; + }else{ + representation[redir_query][redir_db]++; + break; + } + i_r -= MIN(1.0, pixel_size_query); + j_r -= MIN(1.0, pixel_size_db); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + + + //double average_hit = ((double) total_hits / (double) table_size); + //average_hit = 2.2; + + /* + //fprintf(stdout, "[INFO] Total hit count is %"PRIu64" on a size of %"PRIu64" Avg = %e.\n", total_hits, table_size, average_hit); + fprintf(stdout, "[INFO] Total hit count is %"PRIu64" on a size of %"PRIu64" E' = %e.\n", total_hits, table_size, Eprime); + + + + + + a_hundreth = MAX(1,c_hash_holder/100); + + for(current_len = 0; current_len < c_hash_holder; current_len++){ + + if(current_len % a_hundreth == 0){ + fprintf(stdout, "\r%"PRIu64"%%...", 1+100*current_len/c_hash_holder); + fflush(stdout); + } + + aux = hash_holder_table[current_len].node->next; + + //if(hash_holder_table[current_len].th->hit_count < (uint64_t) average_hit){ + if(hash_holder_table[current_len].th->hit_count < (uint64_t) Eprime){ + while(aux != NULL){ + // Convert scale to representation + uint64_t redir_db = (uint64_t) (aux->pos / (ratio_db)); + uint64_t redir_query = (uint64_t) (hash_holder_table[current_len].pos / (ratio_query)); + double i_r = i_r_fix; double j_r = j_r_fix; + while((uint64_t) i_r >= 1 && (uint64_t) j_r >= 1){ + if((int64_t) redir_query - (int64_t) i_r > 0 && (int64_t) redir_db - (int64_t) j_r > 0){ + representation[(int64_t) redir_query - (int64_t) i_r][(int64_t) redir_db - (int64_t) j_r]++; + }else{ + representation[redir_query][redir_db]++; + break; + } + i_r -= MIN(1.0, pixel_size_query); + j_r -= MIN(1.0, pixel_size_db); + } + aux = aux->next; + } + } + } + */ + + + //end = clock(); + + + + + //fprintf(stdout, "\n[INFO] Query length %"PRIu64". Hits completed. Took %e seconds\n", data_query.total_len, (double)(end-begin)/CLOCKS_PER_SEC); + fprintf(stdout, "\n[INFO] Query length %"PRIu64".\n", current_len); + + //begin = clock(); + + //reads_per_thread = (uint64_t) (floorl((long double) data_query.n_seqs / (long double) n_threads)); + + fprintf(stdout, "[INFO] Writing matrix.\n"); + + + uint64_t unique_diffuse = 0; + fprintf(out_database, "%"PRIu64"\n", aprox_len_query); + fprintf(out_database, "%"PRIu64"\n", aprox_len_db); + // And replace 2's with 1's + + for(i=0; itable); + //free(hash_holder_table); + /* + for(i=0;i<=n_pools_used_AVL;i++){ + free(mp_AVL[i].base); + } + + for(i=0;i<=n_pools_used;i++){ + free(mp[i].base); + } + */ + for(i=0;i1 (default 32)]\n"); + fprintf(stdout, " -diffuse [Integer: z>0 (default 4)]\n"); + fprintf(stdout, " -dimension Size of the output [Integer: d>0 (default 1000)]\n"); + fprintf(stdout, " -out [File path]\n"); + fprintf(stdout, " --help Shows help for program usage\n"); + fprintf(stdout, "\n"); + fprintf(stdout, "PLEASE NOTICE: The reverse complementary is calculated for the QUERY.\n"); + exit(1); + } + if(strcmp(av[pNum], "-query") == 0){ + *query = fopen64(av[pNum+1], "rt"); + if(query==NULL) terror("Could not open query file"); + } + if(strcmp(av[pNum], "-db") == 0){ + *database = fopen64(av[pNum+1], "rt"); + if(database==NULL) terror("Could not open database file"); + } + if(strcmp(av[pNum], "-out") == 0){ + *out_database = fopen(av[pNum+1], "wt"); + if(out_database==NULL) terror("Could not open output database file"); + } + if(strcmp(av[pNum], "-kmer") == 0){ + *custom_kmer = (uint64_t) atoi(av[pNum+1]); + if(*custom_kmer < BYTES_IN_MER) terror("K-mer size must be larger than 4"); + if(*custom_kmer % BYTES_IN_MER != 0) terror("K-mer size must be a multiple of 4"); + + } + if(strcmp(av[pNum], "-diffuse") == 0){ + *diffuse_z = (uint64_t) atoi(av[pNum+1]); + if(*diffuse_z == 0 || *diffuse_z > 32) terror("Z must satisfy 0'){ + while(c != '\n') c = getc(f); + } + c = toupper(c); + if(c >= 'A' && c <= 'Z'){ + ++l; + } + } + + + rewind(f); + return l; +} diff -r 000000000000 -r 7fdf47a0bae8 chromeister/src/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/src/Makefile Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,14 @@ +CC=gcc +CXX=g++ +CFLAGS=-O3 -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -Wall #-DVERBOSE +BIN=../bin + +all: CHROMEISTER + + + +CHROMEISTER: CHROMEISTER.c + $(CC) $(CFLAGS) alignmentFunctions.c -lm commonFunctions.c -lm CHROMEISTER.c -lm -o $(BIN)/CHROMEISTER + +clean: + rm -rf $(BIN)/CHROMEISTER diff -r 000000000000 -r 7fdf47a0bae8 chromeister/src/alignmentFunctions.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/src/alignmentFunctions.c Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,266 @@ +#include +#include +#include +#include +#include +#include +#include +#include "structs.h" +#include "alignmentFunctions.h" +#include "commonFunctions.h" + + + +int64_t compare_letters(unsigned char a, unsigned char b){ + if(a != (unsigned char) 'N' && a != (unsigned char) '>') return (a == b) ? POINT : -POINT; + return -POINT; +} + +llpos * getNewLocationllpos(Mempool_l * mp, uint64_t * n_pools_used){ + + if(mp[*n_pools_used].current == POOL_SIZE){ + *n_pools_used += 1; + if(*n_pools_used == MAX_MEM_POOLS) terror("Reached max pools"); + init_mem_pool_llpos(&mp[*n_pools_used]); + + } + + llpos * new_pos = mp[*n_pools_used].base + mp[*n_pools_used].current; + mp[*n_pools_used].current++; + + + return new_pos; +} + +void init_mem_pool_llpos(Mempool_l * mp){ + mp->base = (llpos *) calloc(POOL_SIZE, sizeof(llpos)); + if(mp->base == NULL) terror("Could not request memory pool"); + mp->current = 0; +} + +AVLTree * getNewLocationAVLTree(Mempool_AVL * mp, uint64_t * n_pools_used, uint64_t key){ + + if(mp[*n_pools_used].current == POOL_SIZE){ + *n_pools_used += 1; + if(*n_pools_used == MAX_MEM_POOLS) terror("Reached max pools"); + init_mem_pool_AVL(&mp[*n_pools_used]); + + } + + AVLTree * new_pos = mp[*n_pools_used].base + mp[*n_pools_used].current; + mp[*n_pools_used].current++; + + new_pos->key = key; + new_pos->count = 1; + new_pos->height = 1; + + return new_pos; +} + +void init_mem_pool_AVL(Mempool_AVL * mp){ + mp->base = (AVLTree *) calloc(POOL_SIZE, sizeof(AVLTree)); + if(mp->base == NULL) terror("Could not request memory pool"); + mp->current = 0; +} + + + +/* +// An AVL tree node +typedef struct AVL_Node{ + uint64_t key; + struct AVL_Node * left; + struct AVL_Node * right; + uint64_t height; + llpos * next; +} AVLTree; +*/ + +// A utility function to get height of the tree + +uint64_t height(AVLTree * N){ + if (N == NULL) + return 0; + return N->height; +} + +/* Substituted by (x == NULL) ? (0) : (x->height) */ + +/* Helper function that allocates a new node with the given key and + NULL left and right pointers. */ + +/* This one is substituted by AVLTree * getNewLocationAVLTree(Mempool_AVL * mp, uint64_t * n_pools_used, uint64_t key) */ + +// A utility function to right rotate subtree rooted with y +// See the diagram given above. +AVLTree * right_rotate(AVLTree * y){ + AVLTree * x = y->left; + AVLTree * T2 = x->right; + + // Perform rotation + x->right = y; + y->left = T2; + + // Update heights + //x->height = MAX((x == NULL) ? (0) : (x->left->height), (x == NULL) ? (0) : (x->right->height))+1; + //y->height = MAX((y == NULL) ? (0) : (y->left->height), (y == NULL) ? (0) : (y->right->height))+1; + // Update heights + y->height = MAX(height(y->left), height(y->right))+1; + x->height = MAX(height(x->left), height(x->right))+1; + + // Return new root + return x; +} + +// A utility function to left rotate subtree rooted with x +// See the diagram given above. +AVLTree * left_rotate(AVLTree * x){ + AVLTree * y = x->right; + AVLTree * T2 = y->left; + + // Perform rotation + y->left = x; + x->right = T2; + + // Update heights + //x->height = MAX((x == NULL) ? (0) : (x->left->height), (x == NULL) ? (0) : (x->right->height))+1; + //y->height = MAX((y == NULL) ? (0) : (y->left->height), (y == NULL) ? (0) : (y->right->height))+1; + x->height = MAX(height(x->left), height(x->right))+1; + y->height = MAX(height(y->left), height(y->right))+1; + + // Return new root + return y; +} + +// Get Balance factor of node N + +int64_t get_balance(AVLTree * N){ + if (N == NULL) + return 0; + return height(N->left) - height(N->right); +} + +/* Substituted by (node == NULL) ? (0) : ((int64_t) node->left->height - (int64_t) node->right->height) */ + +AVLTree * find_AVLTree(AVLTree * node, uint64_t key){ + AVLTree * found = NULL; + if(node == NULL) return NULL; + + if (key < node->key) { + found = find_AVLTree(node->left, key); + } else if (key > node->key) { + found = find_AVLTree(node->right, key); + } else { + return node; + } + return found; +} + +llpos * find_AVLTree_llpos(AVLTree * node, uint64_t key){ + llpos * aux = NULL; + if(node == NULL) return NULL; + + if (key < node->key) { + aux = find_AVLTree_llpos(node->left, key); + } else if (key > node->key) { + aux = find_AVLTree_llpos(node->right, key); + } else { + return node->next; + } + return aux; +} + +// Recursive function to insert key in subtree rooted +// with node and returns new root of subtree. +AVLTree * insert_AVLTree(AVLTree * node, uint64_t key, Mempool_AVL * mp, uint64_t * n_pools_used, uint64_t pos, Mempool_l * mp_l, uint64_t * n_pools_used_l){ + /* 1. Perform the normal BST insertion */ + if (node == NULL){ + + AVLTree * n_node = getNewLocationAVLTree(mp, n_pools_used, key); + llpos * aux = getNewLocationllpos(mp_l, n_pools_used_l); + aux->pos = pos; + n_node->next = aux; + return n_node; + } + + if (key < node->key) { + node->left = insert_AVLTree(node->left, key, mp, n_pools_used, pos, mp_l, n_pools_used_l); + } else if (key > node->key) { + node->right = insert_AVLTree(node->right, key, mp, n_pools_used, pos, mp_l, n_pools_used_l); + } else { + // Equal keys are inserted as a linked list + if(node->count == 1){ // DO NOT INSERT MORE THAN 2. 1 is good 2 is repetition + llpos * aux = getNewLocationllpos(mp_l, n_pools_used_l); + aux->pos = pos; + aux->next = node->next; + node->next = aux; + ++(node->count); + } + + return node; + } + + /* 2. Update height of this ancestor node */ + //node->height = 1 + MAX((node->left == NULL) ? (0) : (node->left->height), (node->right == NULL) ? (0) : (node->right->height)); + node->height = 1 + MAX(height(node->left), height(node->right)); + + /* 3. Get the balance factor of this ancestor + node to check whether this node became + unbalanced */ + //int64_t balance = (node->left == NULL || node->right == NULL) ? (0) : ((int64_t) node->left->height - (int64_t) node->right->height); + int64_t balance = get_balance(node); + + // If this node becomes unbalanced, then + // there are 4 cases + + // Left Left Case + if (balance > 1 && key < node->left->key) + return right_rotate(node); + + // Right Right Case + if (balance < -1 && key > node->right->key) + return left_rotate(node); + + // Left Right Case + if (balance > 1 && key > node->left->key) + { + node->left = left_rotate(node->left); + return right_rotate(node); + } + + // Right Left Case + if (balance < -1 && key < node->right->key) + { + node->right = right_rotate(node->right); + return left_rotate(node); + } + + /* return the (unchanged) node pointer */ + return node; +} + +// A utility function to print preorder traversal +// of the tree. +// The function also prints height of every node + +void pre_order(AVLTree * root){ + if(root != NULL){ + printf("%"PRIu64" ", root->key); + llpos * aux = root->next; + while(aux != NULL){ printf("#%"PRIu64", ", aux->pos); aux = aux->next; } + pre_order(root->left); + pre_order(root->right); + } +} + + +uint64_t sum_of_all_tree(AVLTree * root){ + uint64_t mysum = 0; + if(root != NULL){ + + mysum = root->count; + mysum += sum_of_all_tree(root->left); + mysum += sum_of_all_tree(root->right); + } + return mysum; +} \ No newline at end of file diff -r 000000000000 -r 7fdf47a0bae8 chromeister/src/alignmentFunctions.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/src/alignmentFunctions.h Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,49 @@ +#define QF_LAMBDA 0.275 +#define QF_KARLIN 0.333 + +typedef struct container{ + llpos * table[4][4][4][4][4][4][4][4][4][4][4][4]; +} Container; + +typedef struct index{ + Tuple_hits table[4][4][4][4][4][4][4][4][4][4][4][4]; +} Index; + +/* + Nucleotides matching function +*/ +int64_t compare_letters(unsigned char a, unsigned char b); + +/** + * Initialize the memory pool to later retrieve individual memory addresses for llpos + * + */ +void init_mem_pool_llpos(Mempool_l * mp); + +/** + * Get a new memory address from the pool mp for a type llpos + * + */ +llpos * getNewLocationllpos(Mempool_l * mp, uint64_t * n_pools_used); + + + +AVLTree * getNewLocationAVLTree(Mempool_AVL * mp, uint64_t * n_pools_used, uint64_t key); + + +void init_mem_pool_AVL(Mempool_AVL * mp); + + +AVLTree * right_rotate(AVLTree * y); + +AVLTree * left_rotate(AVLTree * x); + +AVLTree * find_AVLTree(AVLTree * node, uint64_t key); + +llpos * find_AVLTree_llpos(AVLTree * node, uint64_t key); + +AVLTree * insert_AVLTree(AVLTree * node, uint64_t key, Mempool_AVL * mp, uint64_t * n_pools_used, uint64_t pos, Mempool_l * mp_l, uint64_t * n_pools_used_l); + +void pre_order(AVLTree * root); + +uint64_t sum_of_all_tree(AVLTree * root); diff -r 000000000000 -r 7fdf47a0bae8 chromeister/src/combine_reads.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/src/combine_reads.c Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,82 @@ +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) <= (y)) ? (x) : (y)) +#define STARTING_SEQS 1000 +#define PIECE_OF_DB_REALLOC 3200000 //half a gigabyte if divided by 8 bytes +#define MATVAL 101 + +int main(int argc, char ** av){ + + if(argc != 2) terror("USE:: combine_reads "); + + FILE * results, * data; + data = fopen(av[1], "rt"); + if(data == NULL) terror("Could not open input file"); + + uint64_t * mat = (uint64_t *) calloc(MATVAL, sizeof(uint64_t)); + if(mat == NULL) terror("Could not allocate matrix array"); + + char buffer[MAXLID]; + if ((results = fopen("accu.log", "r")) == NULL){ + results = fopen("accu.log", "wt"); + }else{ + // Load the matrix + uint64_t i; + + for(i=0;i<100;i++){ + if(0 == fgets(buffer, MAXLID, results)) terror("Missing number on load"); + + //fprintf(stdout, "Have %s\n", buffer); + buffer[strlen(buffer)-1] = '\0'; + mat[i] = asciiToUint64(buffer); + //fprintf(stdout, "%"PRIu64"\n", mat[i]); + //getchar(); + } + fclose(results); + results = fopen("accu.log", "wt"); // Re open + } + + // Read file + uint64_t read_id_1, read_id_2, coverage, identity, length, current, currmax, j; + while(!feof(data)){ + if(0 == fgets(buffer, MAXLID, data) && !feof(data)) terror("Missing values"); + // 2 77277 89 64 213 + sscanf(buffer, "%"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64, &read_id_1, &read_id_2, &coverage, &identity, &length); + //fprintf(stdout, "%"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64"\n", read_id_1, read_id_2, coverage, identity, length); + currmax = MIN(identity, coverage); + //fprintf(stdout, "%"PRIu64"\n", currmax); + current = read_id_1; + /* + for(j=currmax; j > 1; j--){ + if(current != lasts[j]){ + mat[j]++; + lasts[j] = current; + } + } + */ + mat[currmax] += 1; + + } + + for(j=99; j>0; j--){ + mat[j] += mat[j+1]; + } + mat[0] = mat[1]; + + for(j=0; j<100; j++){ + fprintf(stdout, "%"PRIu64"\n", mat[j]); + fprintf(results, "%"PRIu64"\n", mat[j]); + } + + + fclose(results); + fclose(data); + free(mat); + return 0; +} \ No newline at end of file diff -r 000000000000 -r 7fdf47a0bae8 chromeister/src/commonFunctions.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/src/commonFunctions.c Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,218 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "structs.h" + +void terror(char *s) { + printf("ERR**** %s ****\n", s); + exit(-1); +} + +char buffered_fgetc(char *buffer, uint64_t *pos, uint64_t *read, FILE *f) { + if (*pos >= READBUF) { + *pos = 0; + memset(buffer, 0, READBUF); + *read = fread(buffer, 1, READBUF, f); + } + *pos = *pos + 1; + return buffer[*pos-1]; +} + +Queue * generate_queue(Head * queue_head, uint64_t t_reads, uint64_t n_threads, uint64_t levels){ + uint64_t i, j; + uint64_t reads_per_thread; + if(levels > t_reads) levels = t_reads; + uint64_t pieces = t_reads/levels; + uint64_t from, to, t_queues = 0, current_queue = 0; + for(i=0;ihead = &queues[0]; + + for(i=0;ir1, q->r2); +} + +Queue * get_task_from_queue(Head * queue_head, pthread_mutex_t * lock){ + pthread_mutex_lock(lock); + + Queue * ptr = queue_head->head; + if(queue_head->head != NULL) queue_head->head = queue_head->head->next; + //if(ptr != NULL){ printf("Taking "); /*print_queue(ptr);*/ } + + pthread_mutex_unlock(lock); + + + return ptr; +} + +uint64_t quick_pow4(uint64_t n){ + static uint64_t pow4[33]={1L, 4L, 16L, 64L, 256L, 1024L, 4096L, 16384L, 65536L, + 262144L, 1048576L, 4194304L, 16777216L, 67108864L, 268435456L, 1073741824L, 4294967296L, + 17179869184L, 68719476736L, 274877906944L, 1099511627776L, 4398046511104L, 17592186044416L, + 70368744177664L, 281474976710656L, 1125899906842624L, 4503599627370496L, 18014398509481984L, + 72057594037927936L, 288230376151711744L, 1152921504606846976L, 4611686018427387904L}; + return pow4[n]; +} + +uint64_t quick_pow4byLetter(uint64_t n, const char c){ + static uint64_t pow4_G[33]={2*1L, 2*4L, 2*16L, 2*64L, 2*256L, 2*1024L, 2*4096L, 2*16384L, 2*65536L, + (uint64_t)2*262144L, (uint64_t)2*1048576L,(uint64_t)2*4194304L, (uint64_t)2*16777216L, (uint64_t)2*67108864L, (uint64_t)2*268435456L, (uint64_t)2*1073741824L, (uint64_t)2*4294967296L, + (uint64_t)2*17179869184L, (uint64_t)2*68719476736L, (uint64_t)2*274877906944L, (uint64_t)2*1099511627776L, (uint64_t)2*4398046511104L, (uint64_t)2*17592186044416L, + (uint64_t)2*70368744177664L, (uint64_t)2*281474976710656L, (uint64_t)2*1125899906842624L, (uint64_t)2*4503599627370496L, (uint64_t)2*18014398509481984L, + (uint64_t)2*72057594037927936L, (uint64_t) 2*288230376151711744L, (uint64_t) 2*1152921504606846976L, (uint64_t) 2*4611686018427387904L}; + + static uint64_t pow4_T[33]={3*1L, 3*4L, 3*16L, 3*64L, 3*256L, 3*1024L, 3*4096L, 3*16384L, 3*65536L, + (uint64_t)3*262144L, (uint64_t) 3*1048576L, (uint64_t)3*4194304L, (uint64_t)3*16777216L, (uint64_t)3*67108864L, (uint64_t)3*268435456L, (uint64_t)3*1073741824L, (uint64_t)3*4294967296L, + (uint64_t)3*17179869184L, (uint64_t)3*68719476736L, (uint64_t)3*274877906944L, (uint64_t)3*1099511627776L, (uint64_t)3*4398046511104L, (uint64_t)3*17592186044416L, + (uint64_t)3*70368744177664L, (uint64_t)3*281474976710656L, (uint64_t)3*1125899906842624L, (uint64_t)3*4503599627370496L, (uint64_t)3*18014398509481984L, + (uint64_t)3*72057594037927936L, (uint64_t) 3*288230376151711744L, (uint64_t) 3*1152921504606846976L, (uint64_t) 3*4611686018427387904L}; + + if(c == 'A') return 0; + if(c == 'C') return quick_pow4(n); + if(c == 'G') return pow4_G[n]; + if(c == 'T') return pow4_T[n]; + return 0; +} + +uint64_t hashOfWord(const unsigned char * word, uint32_t k, uint64_t offset){ + + uint64_t value = 0, jIdx; + for(jIdx=0;jIdx= 0){ + v = (uint64_t) floor(hash / (pow(4, jIdx))); + if(v == 0){ word[upIdx++] = (unsigned char) 'A'; hash -= ((uint64_t) pow(4, jIdx) * 0); } + if(v == 1){ word[upIdx++] = (unsigned char) 'C'; hash -= ((uint64_t) pow(4, jIdx) * 1); } + if(v == 2){ word[upIdx++] = (unsigned char) 'G'; hash -= ((uint64_t) pow(4, jIdx) * 2); } + if(v == 3){ word[upIdx++] = (unsigned char) 'T'; hash -= ((uint64_t) pow(4, jIdx) * 3); } + + if(jIdx == 0) break; + --jIdx; + } +} + +uint64_t collisioned_hash(const unsigned char * word, uint32_t k){ + uint64_t jIdx, value = 0; + for(jIdx=0;jIdx +#include +#include +#include +#include +#include "commonFunctions.h" + +#define SEQSIZE 2000000000 +#define READINPUT 10000 +#define WSIZE 32 +#define NREADS 1000000 + +int main(int ac, char** av) { + FILE *fIn, *fOut; + int64_t i, j, nR, seqLen = 0; + char *seq, c, toW; + long *offset = NULL; + + if (ac != 3) + terror("USE: reverseComplement seqFile.IN reverseComplementarySeq.OUT"); + + /** + * Allocating memory for the sequence + * Fixed amount of memory, change it to loadSeqDB? + */ + if ((seq = (char*) malloc(SEQSIZE)) == NULL) + terror("memory for Seq"); + + if ((fIn = fopen(av[1], "rt")) == NULL) + terror("opening IN sequence FASTA file"); + + if ((fOut = fopen(av[2], "wt")) == NULL) + terror("opening OUT sequence Words file"); + + if ((offset = (long *) malloc(sizeof(long)*NREADS)) == NULL) + terror("memory for offsets"); + + for(i=0;i'){ + offset[nR++] = ftell(fIn)-1; + } + c = fgetc(fIn); + } + + for(i=nR-1; i>=0; i--){ + fseek(fIn, offset[i], SEEK_SET); + //READ and write header + if(fgets(seq, READINPUT, fIn)==NULL){ + terror("Empty file"); + } + fprintf(fOut, "%s", seq); + //READ and write sequence + c = fgetc(fIn); + while(c != '>' && !feof(fIn)){ + if(isupper(c) || islower(c)){ + seq[seqLen++]=c; + } + c = fgetc(fIn); + } + for(j=seqLen-1; j >= 0; j--){ + switch(seq[j]){ + case 'A': + toW = 'T'; + break; + case 'C': + toW = 'G'; + break; + case 'G': + toW = 'C'; + break; + case 'T': + toW = 'A'; + break; + case 'U': + toW = 'A'; + break; + case 'a': + toW = 't'; + break; + case 'c': + toW = 'g'; + break; + case 'g': + toW = 'c'; + break; + case 't': + toW = 'a'; + break; + case 'u': + toW = 'a'; + break; + default: + toW = seq[j]; + break; + } + fwrite(&toW, sizeof(char), 1, fOut); + } + toW='\n'; + fwrite(&toW, sizeof(char), 1, fOut); + seqLen=0; + } + + fclose(fIn); + fclose(fOut); + + return 0; +} + + diff -r 000000000000 -r 7fdf47a0bae8 chromeister/src/structs.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chromeister/src/structs.h Wed Dec 12 07:18:40 2018 -0500 @@ -0,0 +1,143 @@ +#ifndef STRUCTS_H +#define STRUCTS_H + +#include + +//Structs required for the dotplot workflow +#define MAXLID 200 +//#define READBUF 2000000000 //2 GB +#define READBUF 500000000 //50MB +#define INITSEQS 3000 //Number of starting sequences (in database) +#define POINT 4 + +#define FIXED_K 12 +#define TOTAL_ENTRIES 16777216 + +#define MAXLID 200 +#define ALIGN_LEN 60 //For NW alignment +#define MAX_READ_SIZE 20000 //Maximum length of read to have a portion of the table allocated +#define MAX_WINDOW_SIZE 500 //Maximum window length to explore NW table +//#define POOL_SIZE 2500000000 // by 16 bytes it is 40 GB +#define POOL_SIZE 12500000 // 1 GB if 16 bytes +#define MAX_MEM_POOLS 256 + +#define BYTES_IN_MER 4 +#define MAX_DECOMP_HASH 10 +#define FALSE 0 +#define TRUE 1 + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) <= (y)) ? (x) : (y)) + +extern uint64_t custom_kmer; +extern uint64_t diffuse_z; + +//Struct for linked list of positions +typedef struct linked_list_pos{ + uint64_t pos; + //uint64_t extended_hash; + //unsigned char decomp_hash[MAX_DECOMP_HASH]; // Fits up to MAX_DECOMP_HASH*4 letters = 256 length kmer + //uint64_t hits_count; + struct linked_list_pos * next; +} llpos; + + +// An AVL tree node +typedef struct AVL_Node{ + uint64_t key; + struct AVL_Node * left; + struct AVL_Node * right; + uint64_t height; + uint64_t count; + llpos * next; +} AVLTree; + + +// Tuple of data +typedef struct tuple_hits{ + int repetition; + int hit_count; + uint64_t key; + uint64_t pos; + uint64_t pos_in_y; +} Tuple_hits; + +typedef struct hash_holder{ + uint64_t key; + uint64_t pos; +} Hash_holder; + + + +//Struct for memory pool por lists +typedef struct mempool_l{ + llpos * base; + uint64_t current; +} Mempool_l; + +//Struct for memory pool por AVLtree +typedef struct mempool_AVL{ + AVLTree * base; + uint64_t current; +} Mempool_AVL; + +//Struct for a whole sequence(s) data +typedef struct seqinfo{ + unsigned char * sequences; + uint64_t * start_pos; + uint64_t total_len; + uint64_t n_seqs; +} SeqInfo; + +//Struct for the alignment of a quick hit +typedef struct quickfrag{ + uint64_t x_start; + uint64_t y_start; + uint64_t t_len; + long double coverage; + long double e_value; +} Quickfrag; + +typedef struct point{ + uint64_t x; + uint64_t y; +} Point; + + + +struct cell{ + int64_t score; + uint64_t xfrom; + uint64_t yfrom; +}; + +struct positioned_cell{ + int64_t score; + uint64_t xpos; + uint64_t ypos; +}; + +struct best_cell{ + struct positioned_cell c; + uint64_t j_prime; +}; + +typedef struct{ + uint64_t identities; + uint64_t length; + uint64_t igaps; + uint64_t egaps; +} BasicAlignment; + +typedef struct queue{ + uint64_t r1; //reads region + uint64_t r2; + struct queue * next; +} Queue; + +typedef struct{ + Queue * head; +} Head; + + +#endif