# HG changeset patch # User bitlab # Date 1544914738 18000 # Node ID 9db88f0f32b7d9fe8d7aaffbe169edba4f699968 Uploaded diff -r 000000000000 -r 9db88f0f32b7 gecko/.gitignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/.gitignore Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,9 @@ +* + +!.gitignore +!bin/*.sh +!bin/*.mat + +!src/* + +!*/ diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/FragHits Binary file gecko/bin/FragHits has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/allVsAll.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/bin/allVsAll.sh Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +DIR=$1 +L=$2 +S=$3 +WL=$4 +EXT=$5 + +array=() +x=0 + +if [ $# != 5 ]; then + echo "***ERROR*** Use: $0 genomesDirectory L(200) S(40) K(8) fastaFilesExtension" + exit -1 +fi + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +for elem in $(ls -d $DIR/*.$EXT | awk -F "/" '{print $NF}' | awk -F ".$EXT" '{print $1}') +do + array[$x]=$elem + x=`expr $x + 1` + #echo "X: $elem" +done + +for ((i=0 ; i < ${#array[@]} ; i++)) +do + for ((j=i ; j < ${#array[@]} ; j++)) + do + if [ $i != $j ]; then + seqX=${array[$i]} + seqY=${array[$j]} + echo "----------${seqX}-${seqY}-----------" + if [[ ! -f frags/${seqX}-${seqY}.frags ]]; then + + echo "${BINDIR}/workflow.sh $DIR/${seqX}.$EXT $DIR/${seqY}.$EXT $L $S $WL 1" + ${BINDIR}/workflow.sh $DIR/${seqX}.$EXT $DIR/${seqY}.$EXT $L $S $WL 1 + + fi + fi + done +done diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/combineFrags Binary file gecko/bin/combineFrags has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/comparison.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/bin/comparison.sh Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,52 @@ +#!/bin/bash + +FL=1000 # frequency limit + +if [ $# != 7 ]; then + echo " ==== ERROR ... you called this script inappropriately." + echo "" + echo " usage: $0 seqXName seqYName lenght similarity WL fixedL strand" + echo "" + exit -1 +fi + +seqXName=$(basename "$1") +extensionX="${seqXName##*.}" +seqXName="${seqXName%.*}" + +seqYName=$(basename "$2") +extensionY="${seqYName##*.}" +seqYName="${seqYName%.*}" + +#seqXName=`basename $1 .fasta` +#seqYName=`basename $2 .fasta` + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +length=${3} +similarity=${4} +WL=${5} # wordSize +fixedL=${6} +strand=${7} +distance=$((4*${WL})) + +if [[ ! -f ../hits/${seqXName}-${seqYName}-K${WL}.hits.sorted.filtered ]]; then + echo "${BINDIR}/hits ${seqXName} ${seqYName} ${seqXName}-${seqYName}-K${WL}.hits ${FL} ${WL}" + ${BINDIR}/hits ${seqXName} ${seqYName} ${seqXName}-${seqYName}-K${WL}.hits ${FL} ${WL} + + echo "${BINDIR}/sortHits 10000000 32 ${seqXName}-${seqYName}-K${WL}.hits ${seqXName}-${seqYName}-K${WL}.hits.sorted" + ${BINDIR}/sortHits 10000000 32 ${seqXName}-${seqYName}-K${WL}.hits ${seqXName}-${seqYName}-K${WL}.hits.sorted + + echo "${BINDIR}/filterHits ${seqXName}-${seqYName}-K${WL}.hits.sorted ${seqXName}-${seqYName}-K${WL}.hits.sorted.filtered ${WL}" + ${BINDIR}/filterHits ${seqXName}-${seqYName}-K${WL}.hits.sorted ${seqXName}-${seqYName}-K${WL}.hits.sorted.filtered ${WL} + + mv ${seqXName}-${seqYName}-K${WL}.hits.sorted.filtered ../hits/ +fi + +ln -s ../hits/${seqXName}-${seqYName}-K${WL}.hits.sorted.filtered . + +echo "${BINDIR}/FragHits $1 $2 ${seqXName}-${seqYName}-K${WL}.hits.sorted.filtered ${seqXName}-${seqYName}-s${strand}.frags ${length} ${similarity} ${WL} ${fixedL} ${strand}" +${BINDIR}/FragHits $1 $2 ${seqXName}-${seqYName}-K${WL}.hits.sorted.filtered ${seqXName}-${seqYName}-s${strand}.frags ${length} ${similarity} ${WL} ${fixedL} ${strand} + +echo "--------------------DONE------------------" + diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/dictionary.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/bin/dictionary.sh Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,27 @@ +#!/bin/bash + +FL=1000 # frequency limit +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +if [ $# != 1 ]; then + echo " ==== ERROR ... you called this script inappropriately." + echo "" + echo " usage: $0 seqXName.fasta" + echo "" + exit -1 +fi + +seqName=$(basename "$1") +extension="${seqName##*.}" +seqName="${seqName%.*}" + +# find words and order +echo "${BINDIR}/words $1 ${seqName}.words.unsort" +${BINDIR}/words $1 ${seqName}.words.unsort +echo "${BINDIR}/sortWords 10000000 32 ${seqName}.words.unsort ${seqName}.words.sort" +${BINDIR}/sortWords 10000000 32 ${seqName}.words.unsort ${seqName}.words.sort + +# Create hash table in disk +echo "${BINDIR}/w2hd ${seqName}.words.sort ${seqName}" +${BINDIR}/w2hd ${seqName}.words.sort ${seqName} + diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/filterFrags Binary file gecko/bin/filterFrags has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/filterHits Binary file gecko/bin/filterHits has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/fragStat Binary file gecko/bin/fragStat has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/frags2align.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/bin/frags2align.sh Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,51 @@ +#!/bin/bash + +if [ $# != 4 ]; then + echo " ==== ERROR ... you called this script inappropriately." + echo "" + echo " usage: $0 fragsFILE.frags fastaX fastaY alignments.txt" + echo "" + exit -1 +fi + +MGDIR=/home/galaxy-bitlab/temp + +FRAGS=$1 +FASTAX=$2 +FASTAY=$3 +ALIGN=$4 + +fragsName=$(basename "$1") +fastaXname=$(basename "$2") +fastaYname=$(basename "$3") +alignName=$(basename "$4") +fragsName="${fragsName%.*}" +fastaXname="${fastaXname%.*}" +fastaYname="${fastaYname%.*}" +alignName="${alignName%.*}" + +cp $1 $MGDIR/${fragsName}.frags +cp $2 $MGDIR/${fastaXname}.fasta +cp $3 $MGDIR/${fastaYname}.fasta + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# generate indices + +$BINDIR/indexmaker $MGDIR/$fastaXname.fasta $MGDIR/$fastaXname.idx +$BINDIR/indexmaker $MGDIR/$fastaYname.fasta $MGDIR/$fastaYname.idx + +$BINDIR/reverseComplement $MGDIR/$fastaYname.fasta $MGDIR/$fastaYname.rev +$BINDIR/indexmaker $MGDIR/$fastaYname.rev $MGDIR/$fastaYname.rev.idx + +# extract frags + +$BINDIR/frags2text $MGDIR/$fragsName.frags $MGDIR/$fastaXname.fasta $MGDIR/$fastaYname.fasta $MGDIR/$fastaYname.rev $MGDIR/$fastaXname.idx $MGDIR/$fastaYname.idx $MGDIR/$fastaYname.rev.idx $MGDIR/$alignName.txt + +rm $MGDIR/$fastaXname.idx +rm $MGDIR/$fastaYname.idx +rm $MGDIR/$fastaYname.rev +rm $MGDIR/$fastaYname.rev.idx + +mv $MGDIR/$alignName.txt $ALIGN + diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/frags2text Binary file gecko/bin/frags2text has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/getInfo Binary file gecko/bin/getInfo has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/hdStat Binary file gecko/bin/hdStat has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/hits Binary file gecko/bin/hits has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/hitsStat Binary file gecko/bin/hitsStat has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/indexmaker Binary file gecko/bin/indexmaker has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/matrix.mat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/bin/matrix.mat Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,6 @@ +ACGT +ACGT +4 -4 -4 -4 +-4 4 -4 -4 +-4 -4 4 -4 +-4 -4 -4 4 diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/reverseComplement Binary file gecko/bin/reverseComplement has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/sortHits Binary file gecko/bin/sortHits has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/sortWords Binary file gecko/bin/sortWords has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/w2hd Binary file gecko/bin/w2hd has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/words Binary file gecko/bin/words has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/wordsStat Binary file gecko/bin/wordsStat has changed diff -r 000000000000 -r 9db88f0f32b7 gecko/bin/workflow.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/bin/workflow.sh Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,165 @@ +#!/bin/bash + +FL=1000 # frequency limit + +if [ $# != 8 ]; then + echo " ==== ERROR ... you called this script inappropriately." + echo "" + echo " usage: $0 seqXName seqYName lenght similarity WL fixedL output.frags output.csv" + echo "" + exit -1 +fi + +#{ + + +MYRAND=$((( RANDOM % 10000000) +1)) +MGDIR=${PWD}/${MYRAND} +echo "MGDIR $MGDIR" +mkdir -p ${MGDIR} + +genoXname=$(basename "$1") +genoYname=$(basename "$2") +genoXname="${genoXname%.*}" +genoYname="${genoYname%.*}" + + +cp $1 $MGDIR/${genoXname}.fasta +cp $2 $MGDIR/${genoYname}.fasta +mkdir -p ${MGDIR}/dictionaries +mkdir -p ${MGDIR}/fragments + +genoXExt="fasta" +genoYExt="fasta" + + + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +length=${3} +similarity=${4} +WL=${5} # wordSize +fixedL=${6} +output=${7} +csv=${8} + +mkdir ${MGDIR}/intermediateFiles + +mkdir ${MGDIR}/intermediateFiles/${genoXname}-${genoYname} +mkdir ${MGDIR}/results +mkdir ${MGDIR}/intermediateFiles/dictionaries +mkdir ${MGDIR}/intermediateFiles/hits + +# Copiamos los fastas +ln -s ${MGDIR}/${genoXname}.${genoXname} ${MGDIR}/intermediateFiles/${genoXname}-${genoYname} +ln -s ${MGDIR}/${genoYname}.${genoYname} ${MGDIR}/intermediateFiles/${genoYname}-${genoXname} + +cd ${MGDIR}/intermediateFiles/${genoXname}-${genoYname} + +############### + + + +echo "${BINDIR}/reverseComplement ${MGDIR}/${genoYname}.${genoXExt} ${genoYname}-revercomp.${genoYExt}" +${BINDIR}/reverseComplement ${MGDIR}/${genoYname}.${genoYExt} ${MGDIR}/${genoYname}-revercomp.${genoYExt} + +if [[ ! -f ../dictionaries/${genoXname}.d2hP ]]; then + echo "${BINDIR}/dictionary.sh ${MGDIR}/${genoXname}.${genoXExt} &" + ${BINDIR}/dictionary.sh ${MGDIR}/${genoXname}.${genoXExt} & +fi + +if [[ ! -f ../dictionaries/${seqYName}.d2hP ]]; then + echo "${BINDIR}/dictionary.sh ${MGDIR}/${genoYname}.${genoYExt} &" + ${BINDIR}/dictionary.sh ${MGDIR}/${genoYname}.${genoYExt} & +fi + +if [[ ! -f ../dictionaries/${genoYname}-revercomp.d2hP ]]; then + echo "${BINDIR}/dictionary.sh ${MGDIR}/${genoYname}-revercomp.${genoYExt} &" + ${BINDIR}/dictionary.sh ${MGDIR}/${genoYname}-revercomp.${genoYExt} & +fi + +echo "Waiting for the calculation of the dictionaries" + +for job in `jobs -p` +do + #echo $job + wait $job +done + + + +mv ${genoXname}.d2hP ../dictionaries/ +mv ${genoXname}.d2hW ../dictionaries/ +mv ${genoYname}.d2hP ../dictionaries/ +mv ${genoYname}.d2hW ../dictionaries/ +mv ${genoYname}-revercomp.d2hP ../dictionaries/ +mv ${genoYname}-revercomp.d2hW ../dictionaries/ + + + +# Hacemos enlace simbolico +ln -s ../dictionaries/${genoXname}.d2hP . +ln -s ../dictionaries/${genoXname}.d2hW . + +ln -s ../dictionaries/${genoYname}.d2hP . +ln -s ../dictionaries/${genoYname}.d2hW . + +ln -s ../dictionaries/${genoYname}-revercomp.d2hP . +ln -s ../dictionaries/${genoYname}-revercomp.d2hW . + +echo "${BINDIR}/comparison.sh ${MGDIR}/${genoXname}.${genoXExt} ${MGDIR}/${genoYname}.${genoYExt} ${length} ${similarity} ${WL} ${fixedL} f &" +${BINDIR}/comparison.sh ${MGDIR}/${genoXname}.${genoXExt} ${MGDIR}/${genoYname}.${genoYExt} ${length} ${similarity} ${WL} ${fixedL} f & + +echo "${BINDIR}/comparison.sh ${MGDIR}/${genoXname}.${genoXExt} ${MGDIR}/${genoYname}-revercomp.${genoYExt} ${length} ${similarity} ${WL} ${fixedL} r &" +${BINDIR}/comparison.sh ${MGDIR}/${genoXname}.${genoXExt} ${MGDIR}/${genoYname}-revercomp.${genoYExt} ${length} ${similarity} ${WL} ${fixedL} r & + +echo "Waiting for the comparisons" + +for job in `jobs -p` +do + #echo $job + wait $job +done + + +#echo "rm ${seqYName}-revercomp.${extensionY}" +#rm ${seqYName}-revercomp.${extensionY} + + +echo "${BINDIR}/combineFrags ${genoXname}-${genoYname}-sf.frags ${genoXname}-${genoYname}-revercomp-sr.frags ${genoXname}-${genoYname}.frags" +${BINDIR}/combineFrags ${genoXname}-${genoYname}-sf.frags ${genoXname}-${genoYname}-revercomp-sr.frags ${genoXname}-${genoYname}.frags + +#Borramos todo menos los frags y los diccionarios + +# Get Info from frags +echo "${BINDIR}/getInfo ${genoXname}-${genoYname}.frags > ${genoXname}-${genoYname}.csv" +${BINDIR}/getInfo ${genoXname}-${genoYname}.frags > ${genoXname}-${genoYname}.csv.tmp +cat ${genoXname}-${genoYname}.frags.INF ${genoXname}-${genoYname}.csv.tmp > ${genoXname}-${genoYname}.csv +rm -rf ${genoXname}-${genoYname}.csv.tmp + +if [[ -L "../../${genoXname}.fasta" ]] +then + rm ../../${genoYname}.fasta +fi + +if [[ -L "../../${genoXname}.fasta" ]] +then + rm ../../${genoYname}.fasta +fi + +#Movemos los frags y los info +mv ${genoXname}-${genoYname}.frags $output +mv ${genoXname}-${genoYname}.csv $csv + + + +#echo "Borrando ${seqXName}-${seqYName}" +cd .. + +#rm -rf ${seqXName}-${seqYName} + +cd .. + + +rm -r ${MGDIR} + diff -r 000000000000 -r 9db88f0f32b7 gecko/frags2align.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/frags2align.xml Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,13 @@ + + Extract alignments from a binary Frags file + + + + + + /home/galaxy-bitlab/galaxy/tools/gecko/bin/frags2align.sh $fragsFile $fastaX $fastaY $alignments + + + + + diff -r 000000000000 -r 9db88f0f32b7 gecko/gecko.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/gecko.xml Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,44 @@ + + A pairwise genome comparison software for the detection of High-scoring Segment Pairs + + /home/galaxy-bitlab/galaxy/tools/gecko/bin/workflow.sh $genome1 $genome2 $length $similarity $wl 1 $frags_output0 $csv_output1 + + + + + + + + + + + + + + + + +GECKO (GEnome Comparison with K-mers Out-of-core) is a fast, modular application designed to identify collections of HSPs in a pairwise genome comparisons. By employing novel filtering and data storing strategies, it is able to compare genome-sized sequences in less time. + +----- + +**Manual** + +To use GECKO, simply upload two .fasta files and select these as Sequence X and as Sequence Y. +Once so, choose the parameters that suit best your comparison: + + +- Minimum length: This parameter is the minimum length in nucleotides for an HSP (similarity fragment) to be conserved. Any HSP below this length will be filtered out of the comparison. It is recommended to use around 40 bp for small organisms (e.g. bacterial mycoplasma or E. Coli) and around 100 bp or more for larger organisms (e.g. human chromosomes). + +- Minimum similarity: This parameter is analogous to the minimum length, however, instead of length, the similarity is used as threshold. The similarity is calculated as the score attained by an HSP divided by the maximum possible score. Use values above 50 to filter noise. + +- Word length: This parameter is the seed size used to find HSPs. A smaller seed size will increase sensitivity and decrease performance, whereas a larger seed size will decrease sensitivity and increase performance. Recommended values are 12 or 16 for smaller organisms and 32 for larger organisms. These values must be multiples of 4. + + + + + 10.1186/s12859-015-0679-9 + + + + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/FragHits.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/FragHits.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,327 @@ +/* FragHITS Fragments detection betweem two bioogical seqeunces + + Search for all By-Identitity ungapped fragments with length > MinLen + + This program is an enhancement of AllFrags (that searches the full space) + In this case, start-Fragments (HITS) has been detected in previous steps and + this hits are used as seeds to extend the fragment. A backward step is also + included to extend the fragment in the second opposite direction from the hit + + Syntax: + + AllFragsHits SeqX.file SeqY.file HitsFile Out.file Lmin SimThr + + Some new parameters have been included: + - HitsFile (binary) long Diag/posX/posY + - Lmin Minimum length in the case of fixed length and percentage otherwise + - SimThr Similarity (identity) Threshold + + - SeqX & SeqY files are in fasta format + - Out file (binary) will save a fragment with the format specified in structs.h. + + oscart@uma.es + -------------------------------------------*/ + +#include +#include +#include +#include "commonFunctions.h" +#include "structs.h" +#include "comparisonFunctions.h" + +#define POINT 4 + +int HitsControl(char **av); +int FragFromHit(long M[1000][100], struct FragFile *myF, hit *H, struct Sequence *sX, + uint64_t n0, struct Sequence *sY, + uint64_t n1, uint64_t nSeqs1, uint64_t LenOrPer, uint64_t SimTh, int WL, + int fixedL, char strand); + +int main(int ac, char **av) { + + if (ac != 10) + terror( + "usage: FragsHits SeqX SeqY Hits.File Outfile Lmin SimThr WL fixedLen(1/0) strand(f/r)"); + + if (av[9][0] != 'f' && av[9][0] != 'r') + terror("strand argument must be 'f' or 'r'"); + + HitsControl(av); + + return 0; +} + +int HitsControl(char **av) { + struct Sequence *sX, *sY; + uint64_t n0, n1, nSeqs0, nSeqs1; + uint64_t Lmin, SimTh; + int WL, fixedL, i, j; + int newFrag; + uint64_t nFrags = 0, nHitsUsed = 0, nHits = 0; + int64_t lastOffset, lastDiagonal; + struct FragFile myF; + char infoFileName[200], matFileName[200]; + char strand; + FILE *f, *fH, *fOut, *fInf, *fMat; + + hit h; + + //MAT + long coverage[1000][100]; + for(i=0; i<1000; i++){ + for(j=0; j<100; j++){ + coverage[i][j]=0; + } + } + //--- + + //Initialize values + Lmin = atoi(av[5]); + SimTh = atoi(av[6]); + WL = atoi(av[7]); + fixedL = atoi(av[8]); + strand = av[9][0]; + + //Open files + if ((f = fopen(av[1], "rt")) == NULL) + terror("opening seqX file"); + sX = LeeSeqDB(f, &n0, &nSeqs0, 0); + fclose(f); + + if ((f = fopen(av[2], "rt")) == NULL) + terror("opening seqY file"); + sY = LeeSeqDB(f, &n1, &nSeqs1, 0); + fclose(f); + + if ((fH = fopen(av[3], "rb")) == NULL) + terror("opening HITS file"); + + if ((fOut = fopen(av[4], "wb")) == NULL) + terror("opening output file"); + + //Write sequence lengths + writeSequenceLength(&n0, fOut); + writeSequenceLength(&n1, fOut); + + // read Hits + if(fread(&h, sizeof(hit), 1, fH)!=1){ + terror("Empty hits file"); + } + lastDiagonal = h.diag; + lastOffset = h.posX - 1; + + while (!feof(fH)) { + nHits++; + if (lastDiagonal != h.diag) { + //Different diagonal update the variables + lastDiagonal = h.diag; + lastOffset = h.posX - 1; + } + //We have a casting here because of a funny error + //where the program was saying that -1 > 0 + if (lastOffset > ((int64_t) h.posX)) { + //The hit is covered by a previous frag in the same diagonal + if(fread(&h, sizeof(hit), 1, fH)!=1){ + if(ferror(fH))terror("Error reading from hits file"); + } + continue; + } + + nHitsUsed++; + newFrag = FragFromHit(coverage, &myF, &h, sX, n0, sY, n1, nSeqs1, Lmin, SimTh, + WL, fixedL, strand); + if (newFrag) { + writeFragment(&myF, fOut); + lastOffset = h.posX + myF.length; + nFrags++; + } + if(fread(&h, sizeof(hit), 1, fH)!=1){ + if(ferror(fH))terror("Error reading from hits file"); + } + } + + fclose(fH); + fclose(fOut); + + sprintf(matFileName, "%s.MAT", av[4]); + if ((fMat = fopen(matFileName, "wt")) == NULL) + terror("opening MAT file"); + + for(i=0; i<1000; i++){ + for(j=0; j<100; j++){ + fprintf(fMat, "%ld\t", coverage[i][j]); + } + fprintf(fMat, "\n"); + } + + fclose(fMat); + + // metadata info + sprintf(infoFileName, "%s.INF", av[4]); + if ((fInf = fopen(infoFileName, "wt")) == NULL) + terror("opening INFO file"); + + fprintf(fInf, "All by-Identity Ungapped Fragments (Hits based approach)\n"); + fprintf(fInf, "[Abr.98/Apr.2010/Dec.2011 -- \n"); + fprintf(fInf, "SeqX filename : %s\n", av[1]); + fprintf(fInf, "SeqY filename : %s\n", av[2]); + fprintf(fInf, "SeqX name : %s\n", sX->ident); + fprintf(fInf, "SeqY name : %s\n", sY->ident); + fprintf(fInf, "SeqX length : %" PRIu64 "\n", n0); + fprintf(fInf, "SeqY length : %" PRIu64 "\n", n1); + fprintf(fInf, "Min.fragment.length : %" PRIu64 "\n", Lmin); + fprintf(fInf, "Min.Identity : %" PRIu64 "\n", SimTh); + fprintf(fInf, "Tot Hits (seeds) : %" PRIu64 "\n", nHits); + fprintf(fInf, "Tot Hits (seeds) used: %" PRIu64 "\n", nHitsUsed); + fprintf(fInf, "Total fragments : %" PRIu64 "\n", nFrags); + fprintf(fInf, "========================================================\n"); + fclose(fInf); + + return nHitsUsed; +} + +/** + * Compute a fragments from one seed point + * Similarirty thershold and length > mimL + */ +int FragFromHit(long M[1000][100], struct FragFile *myF, hit *H, struct Sequence *sX, + uint64_t n0, struct Sequence *sY, + uint64_t n1, uint64_t nSeqs1, uint64_t Lm, uint64_t SimTh, int WL, + int fixedL, char strand) { + int64_t ldiag, ldiag2; + int64_t xfil, ycol; + /* for version with backward search */ + int64_t xfil2, ycol2; + int fragmentLength = 0; + /* for version Maximum global---*/ + int64_t xfilmax, ycolmax; + /* for version with backward search */ + int64_t xfilmax2, ycolmax2; + int nIdentities = 0, maxIdentities = 0; + char valueX, valueY; + int fscore = 0, fscoreMax = 0; // full score + + uint64_t minLength = + (fixedL) ? + Lm : + (uint64_t) (min(getSeqLength(sX), + getSeqLength(sY)) * (Lm / 100.0)); + + // Initialize values + ldiag = min(n0 - H->posX, n1 - H->posY); + //var to know how much we have until we reach the origin of coordinates + ldiag2 = min(H->posX, H->posY); + xfil = H->posX + WL; + xfil2 = H->posX - 1; + ycol = H->posY + WL; + ycol2 = H->posY - 1; + fragmentLength += WL; + xfilmax = xfil; + xfilmax2 = xfil2; + ycolmax = ycol; + ycolmax2 = ycol2; + nIdentities = maxIdentities = WL; + fscore = POINT * WL; + fscoreMax = fscore; + + // now, looking for end_frag--- + while (fragmentLength < ldiag) { + valueX = getValue(sX, xfil); + valueY = getValue(sY, ycol); + if (valueX == '*' || valueY == '*') { + //separator between sequences + break; + } + + if (valueX == 'N' || valueY == 'N') { + fscore -= 1; + } else { + if (valueX == valueY) { + // match + fscore += POINT; + nIdentities++; + if (fscoreMax < fscore) { + fscoreMax = fscore; + xfilmax = xfil; + ycolmax = ycol; + maxIdentities = nIdentities; + } + } else { + fscore -= POINT; + } + } + + xfil++; + ycol++; + fragmentLength++; + if (fscore < 0) + break; + } + + /** + * Backward search --- Oscar (Sept.2013) + **/ + fragmentLength = 0; + fscore = fscoreMax; + xfilmax2 = H->posX; + ycolmax2 = H->posY; + nIdentities = maxIdentities; + if (xfil2 >= 0 && ycol2 >= 0) + while (fragmentLength < ldiag2) { + valueX = getValue(sX, xfil2); + valueY = getValue(sY, ycol2); + if (valueX == '*' || valueY == '*') { + //separator between sequences + break; + } + + if (valueX == 'N' || valueY == 'N') { + fscore -= 1; + } else { + if (valueX == valueY) { + // matches---- + fscore += POINT; + nIdentities++; + if (fscoreMax < fscore) { + fscoreMax = fscore; + xfilmax2 = xfil2; + ycolmax2 = ycol2; + maxIdentities = nIdentities; + } + } else { + fscore -= POINT; + } + } + + xfil2--; + ycol2--; + fragmentLength++; + if (fscore < 0) + break; + } + + // Set the values of the FragFile + myF->diag = H->diag; + myF->xStart = (uint64_t) xfilmax2 - H->seqX; + myF->yStart = (uint64_t) ycolmax2 - H->seqY; + myF->xEnd = (uint64_t) xfilmax - H->seqX; + myF->yEnd = (uint64_t) ycolmax - H->seqY;; + myF->length = myF->xEnd - myF->xStart + 1; + myF->score = fscoreMax; + myF->ident = maxIdentities; + myF->similarity = myF->score * 100.0 + / scoreMax(&sX->datos[myF->xStart], &sY->datos[myF->yStart], + myF->length, POINT); + myF->seqX = H->seqX; + myF->seqY = (strand=='f')? H->seqY : nSeqs1 - H->seqY - 1; + myF->block = 0; + myF->strand = strand; + + M[min(myF->length, 999)][(int)myF->similarity]++; + + if (myF->length > minLength && myF->similarity > SimTh) + return 1; + else + return 0; +} + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/Makefile Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,65 @@ +CC=gcc +CXX=g++ +CFLAGS=-O3 -D_FILE_OFFSET_BITS=64 -Wall +BIN=../bin + +all: reverseComplement words sortWords w2hd wordsStat hits sortHits filterHits FragHits combineFrags hitsStat hdStat fragStat getInfoCSB filterFrags indexmaker frags2text + +hits: hits.c + $(CC) $(CFLAGS) hits.c comparisonFunctions.c dictionaryFunctions.c commonFunctions.c -o $(BIN)/hits + +sortHits: sortHits.c + $(CC) $(CFLAGS) -DBaseType=hit sortHits.c comparisonFunctions.c commonFunctions.c quicksort.c -lpthread -o $(BIN)/sortHits + +filterHits: filterHits.c + $(CC) $(CFLAGS) filterHits.c comparisonFunctions.c commonFunctions.c -o $(BIN)/filterHits + +FragHits: FragHits.c + $(CC) $(CFLAGS) FragHits.c comparisonFunctions.c commonFunctions.c -o $(BIN)/FragHits + +combineFrags: combineFrags.c + $(CC) $(CFLAGS) combineFrags.c comparisonFunctions.c commonFunctions.c -o $(BIN)/combineFrags + +hitsStat: hitsStat.c + $(CC) $(CFLAGS) hitsStat.c commonFunctions.c -o $(BIN)/hitsStat + +hdStat: hdStat.c + $(CC) $(CFLAGS) hdStat.c commonFunctions.c dictionaryFunctions.c -o $(BIN)/hdStat + +fragStat: fragStat.c + $(CC) $(CFLAGS) fragStat.c comparisonFunctions.c commonFunctions.c -o $(BIN)/fragStat + +reverseComplement: reverseComplement.c + $(CC) $(CFLAGS) reverseComplement.c commonFunctions.c -o $(BIN)/reverseComplement + +words: words.c + $(CC) $(CFLAGS) words.c dictionaryFunctions.c commonFunctions.c -o $(BIN)/words + +sortWords: sortWords.c + $(CC) $(CFLAGS) -DBaseType=wentry sortWords.c dictionaryFunctions.c commonFunctions.c quicksort.c -lpthread -o $(BIN)/sortWords + +w2hd: w2hd.c + $(CC) $(CFLAGS) w2hd.c dictionaryFunctions.c commonFunctions.c -o $(BIN)/w2hd + +wordsStat: wordsStat.c + $(CC) $(CFLAGS) wordsStat.c dictionaryFunctions.c commonFunctions.c -o $(BIN)/wordsStat + +getInfoCSB: + $(CC) $(CFLAGS) getInfoCSB.c fragmentv2.c commonFunctions.c comparisonFunctions.c -lm -o $(BIN)/getInfo + +filterFrags: + $(CC) $(CFLAGS) filterFrags.c fragmentv2.c commonFunctions.c comparisonFunctions.c -lm -o $(BIN)/filterFrags + +indexmaker: + $(CC) $(CFLAGS) indexmaker.c fragmentv2.c commonFunctions.c comparisonFunctions.c -lm -o $(BIN)/indexmaker + +frags2text: + $(CC) $(CFLAGS) frags2text.c fragmentv2.c commonFunctions.c comparisonFunctions.c -lm -o $(BIN)/frags2text + +clean: + rm -rf $(BIN)/getInfo + rm -rf $(BIN)/filterFrags + rm -rf $(BIN)/indexmaker + rm -rf $(BIN)/frags2text + rm -rf $(BIN)/reverseComplement $(BIN)/words $(BIN)/sortWords $(BIN)/w2hd $(BIN)/wordsStat + rm -rf $(BIN)/hits $(BIN)/sortHits $(BIN)/filterHits $(BIN)/FragHits $(BIN)/combineFrags $(BIN)/hitsStat $(BIN)/fragStat $(BIN)/hdStat diff -r 000000000000 -r 9db88f0f32b7 gecko/src/combineFrags.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/combineFrags.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,182 @@ +/* reduce 2 fragments files produced in parallel + + Syntax: reduceFrags PrefixFRAGSfiles nChunks fileOUT + + --------------------------------------------------------*/ +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "comparisonFunctions.h" + +void writeINFHeader(FILE *fIn, FILE *fOut); +void getNumberOfHits(FILE *infFile, uint64_t *nHits, uint64_t *nHitsUsed, uint64_t *nFrags); + +int main(int ac, char** av) { + uint64_t n1, n2, nHitsForward, nHitsForwardUsed, nHitsReverse, nHitsReverseUsed, nFragsForward, nFragsReverse; + char infoFileName[200], matFileName[200]; + FILE *fIn, *fInfIn1, *fInfIn2, *fInf, *fOut, *fMat; + + struct FragFile frag; + + long coverage[1000][100], valor, i, j; + for(i=0; i<1000; i++){ + for(j=0; j<100; j++){ + coverage[i][j]=0; + } + } + + if (ac != 4) + terror("Syntax: combineFrags forwardStrandFrags reverseStrandFrags fileOUT"); + + if ((fIn = fopen(av[1], "rb")) == NULL) + terror("Input forwardStrandFrags open error"); + + sprintf(infoFileName, "%s.INF", av[1]); + if ((fInfIn1 = fopen(infoFileName, "rt")) == NULL) + terror("Input forwardStrandFrags.INF open error"); + + //MAT + sprintf(matFileName, "%s.MAT", av[1]); + if ((fMat = fopen(matFileName, "rt")) == NULL) + terror("opening forwardStrandFrags.MAT file"); + for (i=0;i<1000;i++){ + for (j=0;j<100;j++){ + if(fscanf(fMat,"%ld\t",&valor)!=1){ + terror("Error reading forwardStrandFrags.MAT file"); + } + coverage[i][j]+=valor; + } + } + + fclose(fMat); + + if ((fOut = fopen(av[3], "wb")) == NULL) + terror("OUTput file open error"); + + // sequences lengths + readSequenceLength(&n1, fIn); + readSequenceLength(&n2, fIn); + + writeSequenceLength(&n1, fOut); + writeSequenceLength(&n2, fOut); + + //First file... + readFragment(&frag, fIn); + + while (!feof(fIn)) { + writeFragment(&frag, fOut); + readFragment(&frag, fIn); + } + + fclose(fIn); + + if ((fIn = fopen(av[2], "rb")) == NULL) + terror("Input reverseStrandFrags open error"); + + sprintf(infoFileName, "%s.INF", av[2]); + if ((fInfIn2 = fopen(infoFileName, "rt")) == NULL) + terror("Input reverseStrandFrags.INF open error"); + + //MAT + sprintf(matFileName, "%s.MAT", av[2]); + if ((fMat = fopen(matFileName, "rt")) == NULL) + terror("opening reverseStrandFrags.MAT file"); + for (i=0;i<1000;i++){ + for (j=0;j<100;j++){ + if(fscanf(fMat,"%ld\t",&valor)!=1) + terror("Error reading reverseStrandFrags.MAT file"); + coverage[i][j]+=valor; + } + } + + fclose(fMat); + + readSequenceLength(&n1, fIn); + readSequenceLength(&n2, fIn); + + //Second file... + readFragment(&frag, fIn); + + while (!feof(fIn)) { + writeFragment(&frag, fOut); + readFragment(&frag, fIn); + } + + //MAT + sprintf(matFileName, "%s.MAT", av[3]); + if ((fMat = fopen(matFileName, "wt")) == NULL) + terror("opening MAT file"); + for (i=0;i<1000;i++){ + for (j=0;j<100;j++){ + fprintf(fMat,"%ld\t",coverage[i][j]); + } + fprintf(fMat, "\n"); + } + + fclose(fMat); + + // metadata info + sprintf(infoFileName, "%s.INF", av[3]); + if ((fInf = fopen(infoFileName, "wt")) == NULL) + terror("opening INFO file"); + + writeINFHeader(fInfIn1, fInf); + if(fseek(fInfIn1, 0, SEEK_SET) != 0) + terror("Error rewinding inf file"); + + getNumberOfHits(fInfIn1, &nHitsForward, &nHitsForwardUsed, &nFragsForward); + getNumberOfHits(fInfIn2, &nHitsReverse, &nHitsReverseUsed, &nFragsReverse); + + fprintf(fInf, "Tot Hits (seeds) : %" PRIu64 "\n", nHitsForward + nHitsReverse); + fprintf(fInf, "Tot Hits (seeds) used: %" PRIu64 "\n", nHitsForwardUsed + nHitsReverseUsed); + fprintf(fInf, "Total fragments : %" PRIu64 "\n", nFragsForward + nFragsReverse); + fprintf(fInf, "========================================================\n"); + fclose(fInf); + + fclose(fIn); + fclose(fInfIn1); + fclose(fInfIn2); + + fclose(fOut); + + return 0; + +} + +void writeINFHeader(FILE *fIn, FILE *fOut){ + char tmp[1024]; + int i=0; + while((i < 10) && (fgets(tmp, 1024, fIn)!=NULL)){ + fprintf(fOut, "%s", tmp); + i++; + } +} + +void getNumberOfHits(FILE *infFile, uint64_t *nHits, uint64_t *nHitsUsed, uint64_t *nFrags){ + char tmp[1024]; + int i=0; + while((i < 10) && (fgets(tmp, 1024, infFile)!=NULL)){ + i++; + } + + if(fgets(tmp, 1024, infFile)==NULL) + terror("Wrong format in INF file, Tot Hits (seeds) line expected"); + + if(sscanf(tmp, "Tot Hits (seeds) : %" PRIu64 "\n", nHits)!=1) + terror("Error reading: Tot Hits (seeds)"); + + if(fgets(tmp, 1024, infFile)==NULL) + terror("Wrong format in INF file, Tot Hits (seeds) used line expected"); + + if(sscanf(tmp, "Tot Hits (seeds) used: %" PRIu64 "\n", nHitsUsed)!=1) + terror("Error reading: Tot Hits (seeds) used"); + + if(fgets(tmp, 1024, infFile)==NULL) + terror("Wrong format in INF file, Total fragments line expected"); + + if(sscanf(tmp, "Total fragments : %" PRIu64 "\n", nFrags)!=1) + terror("Error reading: Total fragments"); +} diff -r 000000000000 -r 9db88f0f32b7 gecko/src/commonFunctions.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/commonFunctions.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,20 @@ +#include +#include +#include +#include +#include "structs.h" + +void terror(char *s) { + printf("ERR**** %s ****\n", s); + exit(-1); +} + +char buffered_fgetc(char *buffer, uint64_t *pos, uint64_t *read, FILE *f) { + if (*pos >= READBUF) { + *pos = 0; + memset(buffer, 0, READBUF); + *read = fread(buffer, 1, READBUF, f); + } + *pos = *pos + 1; + return buffer[*pos-1]; +} diff -r 000000000000 -r 9db88f0f32b7 gecko/src/commonFunctions.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/commonFunctions.h Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,14 @@ +#ifndef COMMON_FUNCTIONS_H +#define COMMON_FUNCTIONS_H + +/** + * Print the error message 's' and exit(-1) + */ +void terror(char *s); + +/** + * Function to buffer file reading + */ +char buffered_fgetc(char *buffer, uint64_t *pos, uint64_t *read, FILE *f); + +#endif /* COMMON_FUNCTIONS_H */ diff -r 000000000000 -r 9db88f0f32b7 gecko/src/comparisonFunctions.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/comparisonFunctions.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,430 @@ +#include +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" + +#define MAXBUF 10000000 + +int readHashEntry(hashentry *h, FILE *f, uint64_t freqThr) { + do { + if(fread(h, sizeof(hashentry), 1, f)!=1){ + if(ferror(f))terror("Error reading hash entry from the file"); + } + } while (!feof(f) && h->num > freqThr); + + if (feof(f)) + return -1; + + return h->num; +} + +void loadWordOcurrences(hashentry he, location** pos, FILE** f) { + // Load word position for he.word + if (he.num > MAXBUF) { + *pos = (location*) realloc(pos, he.num * sizeof(location)); + if (pos == NULL) + terror("memory for position array"); + } + fseek(*f, he.pos, SEEK_SET); + if(fread(*pos, sizeof(location), he.num, *f)!=he.num){ + terror("Not possible to read the number of elements described"); + } +} + +unsigned long scoreMax(char *seq, char *seq2, uint64_t len, int point) { + //CHANGE WHEN USING PAM MATRIX + //Scor=0; for (i=0;iidatos = calloc(SIZE, sizeof(char))) == NULL) { + terror("Memory for sequence..."); + } + + if ((seq = calloc(READBUF, sizeof(char))) == NULL) { + terror("Memory for sequence..."); + } + + i = READBUF + 1; + + if (!fAst) + while ((c = buffered_fgetc(seq, &i, &r, f)) != '>' && (!feof(f) || (feof(f) && i < r ) )); //start seq + if (feof(f) && i >= r) + return 0; + while ((c = buffered_fgetc(seq, &i, &r, f)) == ' '); + + while (k < MAXLID && c != '\n' && c != ' ') { + if (feof(f) && i >= r) + return 0; + + sX->ident[k++] = c; + c = buffered_fgetc(seq, &i, &r, f); + } + + sX->ident[k] = 0; //end of data. + while (c != '\n') + c = buffered_fgetc(seq, &i, &r, f); + c = buffered_fgetc(seq, &i, &r, f); + + //start list with sX2 + while (!feof(f) || (feof(f) && i < r ) ) { + c = toupper(c); + if (c == '>') { + fAst = 1; + seqs++; + sX->datos[lon++] = '*'; + while (c != '\n') { + if ((feof(f) && i >= r )) + return 0; + c = buffered_fgetc(seq, &i, &r, f); + } + //break; + } + if (isupper(c)) + sX->datos[lon++] = c; + if (c == '*') { + sX->datos[lon++] = c; + } + c = buffered_fgetc(seq, &i, &r, f); + } + + free(seq); + + sX->datos[lon] = 0x00; + + lonFinal += lon; + *nSeqs = seqs + 1; + *n = lonFinal - seqs; + return sX; +} + +char getValue(struct Sequence *s, uint64_t pos) { + return s->datos[pos]; +} + +long getSeqLength(struct Sequence *s) { + uint64_t s1 = 0; + while (s1 > 0 && s->datos[s1] != '*') { + s1--; + } + s1++; + char *tmp = strchr(s->datos + s1, '*'); + if (tmp == NULL) { + return strlen(s->datos) - s1 + 1; + } + return tmp - (s->datos + s1) + 1; +} + +void endianessConversion(char *source, char *target, int numberOfBytes){ + int i,j; + for(i=numberOfBytes-1;i>=0;i--){ + j=numberOfBytes-1-i; + target[j]=source[i]; + } +} + + +/** + * Function to read a fragment from the specified file + */ +void readFragment(struct FragFile *frag, FILE *f){ + char tmpArray[8]; + + if(htons(1)==1){ + //big endian + if(fread(&frag->diag, sizeof(int64_t), 1, f)!=1){ + if(feof(f))return; + terror("Error reading the HSP diagonal"); + } + if(fread(&frag->xStart, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP xStart"); + } + if(fread(&frag->yStart, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP yStart"); + } + if(fread(&frag->xEnd, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP xEnd"); + } + if(fread(&frag->yEnd, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP yEnd"); + } + if(fread(&frag->length, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP length"); + } + if(fread(&frag->ident, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP identities"); + } + if(fread(&frag->score, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP score"); + } + if(fread(&frag->similarity, sizeof(float), 1, f)!=1){ + terror("Error reading the HSP similarity"); + } + if(fread(&frag->seqX, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP seqX"); + } + if(fread(&frag->seqY, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP seqY"); + } + if(fread(&frag->block, sizeof(int64_t), 1, f)!=1){ + terror("Error reading the HSP block"); + } + frag->strand = fgetc(f); + } else { + //little endian + if(fread(tmpArray, sizeof(int64_t), 1, f)!=1){ + if(feof(f))return; + terror("Error reading the HSP diagonal"); + } + endianessConversion(tmpArray, (char *)(&frag->diag), sizeof(int64_t)); + if(fread(tmpArray, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP xStart"); + } + endianessConversion(tmpArray, (char *)(&frag->xStart), sizeof(uint64_t)); + if(fread(tmpArray, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP yStart"); + } + endianessConversion(tmpArray, (char *)(&frag->yStart), sizeof(uint64_t)); + if(fread(tmpArray, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP xEnd"); + } + endianessConversion(tmpArray, (char *)(&frag->xEnd), sizeof(uint64_t)); + if(fread(tmpArray, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP yEnd"); + } + endianessConversion(tmpArray, (char *)(&frag->yEnd), sizeof(uint64_t)); + if(fread(tmpArray, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP length"); + } + endianessConversion(tmpArray, (char *)(&frag->length), sizeof(uint64_t)); + if(fread(tmpArray, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP identity"); + } + endianessConversion(tmpArray, (char *)(&frag->ident), sizeof(uint64_t)); + if(fread(tmpArray, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP score"); + } + endianessConversion(tmpArray, (char *)(&frag->score), sizeof(uint64_t)); + if(fread(tmpArray, sizeof(float), 1, f)!=1){ + terror("Error reading the HSP float"); + } + endianessConversion(tmpArray, (char *)(&frag->similarity), sizeof(float)); + if(fread(tmpArray, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP seqX"); + } + endianessConversion(tmpArray, (char *)(&frag->seqX), sizeof(uint64_t)); + if(fread(tmpArray, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading the HSP seqY"); + } + endianessConversion(tmpArray, (char *)(&frag->seqY), sizeof(uint64_t)); + if(fread(tmpArray, sizeof(int64_t), 1, f)!=1){ + terror("Error reading the HSP block"); + } + endianessConversion(tmpArray, (char *)(&frag->block), sizeof(int64_t)); + frag->strand = fgetc(f); + } +} + +/** + * Function to write a fragment to the specified file + */ +void writeFragment(struct FragFile *frag, FILE *f){ + char tmpArray[8]; + if(htons(1)==1){ + //Big endian + fwrite(&frag->diag, sizeof(int64_t), 1, f); + fwrite(&frag->xStart, sizeof(uint64_t), 1, f); + fwrite(&frag->yStart, sizeof(uint64_t), 1, f); + fwrite(&frag->xEnd, sizeof(uint64_t), 1, f); + fwrite(&frag->yEnd, sizeof(uint64_t), 1, f); + fwrite(&frag->length, sizeof(uint64_t), 1, f); + fwrite(&frag->ident, sizeof(uint64_t), 1, f); + fwrite(&frag->score, sizeof(uint64_t), 1, f); + fwrite(&frag->similarity, sizeof(float), 1, f); + fwrite(&frag->seqX, sizeof(uint64_t), 1, f); + fwrite(&frag->seqY, sizeof(uint64_t), 1, f); + fwrite(&frag->block, sizeof(int64_t), 1, f); + fputc(frag->strand, f); + } else { + //Little endian + endianessConversion((char *)(&frag->diag), tmpArray, sizeof(int64_t)); + fwrite(tmpArray, sizeof(int64_t), 1, f); + endianessConversion((char *)(&frag->xStart), tmpArray, sizeof(uint64_t)); + fwrite(tmpArray, sizeof(uint64_t), 1, f); + endianessConversion((char *)(&frag->yStart), tmpArray, sizeof(uint64_t)); + fwrite(tmpArray, sizeof(uint64_t), 1, f); + endianessConversion((char *)(&frag->xEnd), tmpArray, sizeof(uint64_t)); + fwrite(tmpArray, sizeof(uint64_t), 1, f); + endianessConversion((char *)(&frag->yEnd), tmpArray, sizeof(uint64_t)); + fwrite(tmpArray, sizeof(uint64_t), 1, f); + endianessConversion((char *)(&frag->length), tmpArray, sizeof(uint64_t)); + fwrite(tmpArray, sizeof(uint64_t), 1, f); + endianessConversion((char *)(&frag->ident), tmpArray, sizeof(uint64_t)); + fwrite(tmpArray, sizeof(uint64_t), 1, f); + endianessConversion((char *)(&frag->score), tmpArray, sizeof(uint64_t)); + fwrite(tmpArray, sizeof(uint64_t), 1, f); + endianessConversion((char *)(&frag->similarity), tmpArray, sizeof(float)); + fwrite(tmpArray, sizeof(float), 1, f); + endianessConversion((char *)(&frag->seqX), tmpArray, sizeof(uint64_t)); + fwrite(tmpArray, sizeof(uint64_t), 1, f); + endianessConversion((char *)(&frag->seqY), tmpArray, sizeof(uint64_t)); + fwrite(tmpArray, sizeof(uint64_t), 1, f); + endianessConversion((char *)(&frag->block), tmpArray, sizeof(int64_t)); + fwrite(tmpArray, sizeof(int64_t), 1, f); + fputc(frag->strand, f); + } +} + +/** + * Function to read the sequence length + */ +void readSequenceLength(uint64_t *length, FILE *f){ + char tmpArray[8]; + if(htons(1)==1){ + //big endian + if(fread(length, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading sequence length"); + } + } else { + //little endian + if(fread(tmpArray, sizeof(uint64_t), 1, f)!=1){ + terror("Error reading sequence length"); + } + endianessConversion(tmpArray, (char *)length, sizeof(uint64_t)); + } +} + +/** + * Function to write the sequence length + */ +void writeSequenceLength(uint64_t *length, FILE *f){ + char tmpArray[8]; + if(htons(1)==1){ + //big endian + fwrite(length, sizeof(uint64_t), 1, f); + } else { + //little endian + endianessConversion((char *)length, tmpArray, sizeof(uint64_t)); + fwrite(tmpArray, sizeof(uint64_t), 1, f); + } +} + +/** + * Function to return the sizeof a fragment. + * Due to architeture issues the value of sizeof built-in + * function could be different + */ +long int sizeofFragment(){ + return 9*sizeof(uint64_t)+2*sizeof(int64_t)+1*sizeof(float)+1*sizeof(char); +} + +/**************** ARJONA *******************/ +void cpyFrag2(struct FragFile *f, struct FragFile g){ + + f->diag = g.diag; + f->xStart = g.xStart; + f->xEnd = g.xEnd; + f->yStart = g.yStart; + f->yEnd = g.yEnd; + f->length = g.length; + f->score = g.score; + f->similarity = g.similarity; + f->seqX = g.seqX; + f->seqY = g.seqY; + f->ident = g.ident; + f->block = g.block; + f->strand = g.strand; +} + +/******/ +struct FragFile* readFragments(char* s,int* nf,uint64_t *xtotal,uint64_t *ytotal){ +//Fragment* readFragments(char* s,int* nf,int *xtotal,int *ytotal){ + + FILE* fe; + + struct FragFile* fs,f; + int n; + + if((fe=fopen(s,"rb"))==NULL){ + printf("***ERROR Opening input file"); + exit(-1); + } + n=0; + readSequenceLength(xtotal, fe); + readSequenceLength(ytotal, fe); +// long int longFile; +// fseek(fe,0,SEEK_END); +// longFile=ftell(fe); +// n=(int)(longFile-2*sizeof(uint64_t))/sizeof(struct FragFile); +// printf("\n ReadFragments Complete\nnum: %d\n",n); + + // Alternativa +++++++++++++++++ + n=0; + readFragment(&f, fe); + while(!feof(fe)){ + readFragment(&f, fe); + n++; + } +// printf("\n ReadFragments Complete\nnum: %d\n",n); + //+++++++++++++ + rewind(fe); + fs=(struct FragFile*)malloc(sizeof(struct FragFile)*(n)); + if(fs==NULL){printf("****ERROR: Out of memory\n");exit(-1);} + + *nf=n; + readSequenceLength(xtotal, fe); + readSequenceLength(ytotal, fe); + n=0; + readFragment(&f, fe); + while(!feof(fe)){ + + + + if(f.length>0){ + cpyFrag2(&fs[n],f); + } + n++; + readFragment(&f, fe); + //fprintf(stdout,"%d\t%" PRId64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%c" "\t%" PRIu64 "\n",n,fs[n].xStart, fs[n].yStart, fs[n].xEnd, fs[n].yEnd, fs[n].length, fs[n].strand, fs[n].ident); + + } + *nf=n; + + + fclose(fe); + return fs; +} +/************************/ + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/comparisonFunctions.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/comparisonFunctions.h Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,67 @@ +#ifndef COMPARISON_FUNCTIONS_H +#define COMPARISON_FUNCTIONS_H + +#define min(x,y) (((x) < (y)) ? (x) : (y)) + +/** + * Read a new hash entry from 'f' with no more occurences + * than 'freqThr' + */ +int readHashEntry(hashentry *h, FILE *f, uint64_t freqThr); + +/** + * Read the ocurrences of the given hash entry from the + * ocurrences file 'f' and stored them in 'pos' + */ +void loadWordOcurrences(hashentry he, location** pos, FILE** f); + +/** + * Get the maximum score possible between the two sequences + */ +unsigned long scoreMax(char *seq, char *seq2, uint64_t len, int point); + +/** + * Read the sequence from disk and store it in a list of Sequence struct + * n: sequence length + * ns: number of nodes in the list + */ +struct Sequence* LeeSeqDB(FILE *f, uint64_t *n, uint64_t *nSeqs, int fAst); + +/** + * Get the value of the sequence in a given position of the list node ns + */ +char getValue(struct Sequence *s, uint64_t pos); + +/** + * Get the length of the sequence 's' + */ +long getSeqLength(struct Sequence *s); + +/** + * Function to read a fragment from the specified file + */ +void readFragment(struct FragFile *frag, FILE *f); + +/** + * Function to write a fragment to the specified file + */ +void writeFragment(struct FragFile *frag, FILE *f); + +/** + * Function to read the sequence length + */ +void readSequenceLength(uint64_t *length, FILE *f); + +/** + * Function to write the sequence length + */ +void writeSequenceLength(uint64_t *length, FILE *f); + +/** + * Function to return the sizeof a fragment. + * Due to architeture issues the value of sizeof built-in + * function could be different + */ +long int sizeofFragment(); + +#endif /* COMPARISON_FUNCTIONS_H */ diff -r 000000000000 -r 9db88f0f32b7 gecko/src/dictionaryFunctions.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/dictionaryFunctions.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" + +int seq2word(char* buf, int wsize, word* w) { + int i; + int b = 6; + memset(w, 0, sizeof(word)); + + for (i = 0; i < wsize; i++) { + if (buf[i] >= 4) + return -1; + w->b[i / 4] |= buf[i] << b; + b -= 2; + if (b < 0) + b = 6; + } + return 0; +} + +void skipIDLine(FILE *fIn) { + char c; + // first line (skip ID fasta Line) + c = fgetc(fIn); + while (c != '\n') + c = fgetc(fIn); +} + +int letterToIndex(char c) { + // coding (a=0,c=1,g=2,t=3,'>'=4 others=9 ) + switch (c) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + case '>': + return 4; + default: + return 9; + } +} + +int wordcmp(unsigned char *w1, unsigned char *w2, int n) { + + int i = 0, limit; + + if(n%4 != 0){ + w1[n/4] = w1[n/4] >> (2*(3-((n-1)%4))); + w1[n/4] = w1[n/4] << (2*(3-((n-1)%4))); + w2[n/4] = w2[n/4] >> (2*(3-((n-1)%4))); + w2[n/4] = w2[n/4] << (2*(3-((n-1)%4))); + limit=(n/4)+1; + } else { + limit = n/4; + } + + for (i=0;iw2[i]) return +1; + } + return 0; +} + +void showWord(word* w, char *ws) { + char Alf[] = { 'A', 'C', 'G', 'T' }; + int i; + int wsize = 8; + unsigned char c; + for (i = 0; i < wsize; i++) { + c = w->b[i]; + c = c >> 6; + ws[4*i] = Alf[(int) c]; + c = w->b[i]; + c = c << 2; + c = c >> 6; + ws[4*i+1] = Alf[(int) c]; + c = w->b[i]; + c = c << 4; + c = c >> 6; + ws[4*i+2] = Alf[(int) c]; + c = w->b[i]; + c = c << 6; + c = c >> 6; + ws[4*i+3] = Alf[(int) c]; + } +} diff -r 000000000000 -r 9db88f0f32b7 gecko/src/dictionaryFunctions.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/dictionaryFunctions.h Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,34 @@ +#ifndef DICTIONARY_FUNCTIONS_H +#define DICTIONARY_FUNCTIONS_H + +/** + * Compress the word stored in 'buf' using 2 bits per letter + * The result will be at word 'w' + */ +int seq2word(char* buf, int wsize, word* w); + +/** + * Function to skip the identification line of a fasta sequence + */ +void skipIDLine(FILE *fIn); + +/** + * Function to convert the alphabet letter + * to an index. + * coding (a=0,c=1,g=2,t=3,'>'=4 others=9) + */ +int letterToIndex(char c); + +/** + * Function to print in stdout the given compressed word + */ +void showWord(word* w, char *ws); + +/** + * Function to compare two k-mers + * The function returns 0 if equal, 1 if greater + * than and -1 otherwise + */ +int wordcmp(unsigned char *w1, unsigned char*w2, int n); + +#endif /* DICTIONARY_FUNCTIONS_H */ diff -r 000000000000 -r 9db88f0f32b7 gecko/src/filterFrags.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/filterFrags.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,94 @@ +/* + +filterFrags.c + +Exports frags to CSV filtering by thresholds + +Use: filterFrags file.frags length similarity + +*/ + +#include +#include +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "comparisonFunctions.h" +#define MAX_LEVELS 900000 + + +int main(int ac,char** av){ + + + if(ac<4){ + printf("Use: filterFrags \n"); + exit(-1); + } + + // Read fragments + struct FragFile frag; + uint64_t xtotal,ytotal; + //f=readFragmentsv2(av[1],&nf,&xtotal,&ytotal); + + FILE * fFrags = fopen(av[1], "rb"); + if(fFrags == NULL){ fprintf(stderr, "Could not open input file\n"); exit(-1); } + + readSequenceLength(&xtotal, fFrags); + readSequenceLength(&ytotal, fFrags); + + + + + uint64_t min_l = (uint64_t) atoi(av[2]); + double min_sim = (double) atof(av[3])/100; + + /******************************************/ + + // Print header. Frags file info + printf("All by-Identity Ungapped Fragments (Hits based approach)\n"); + printf("[Abr.98/Apr.2010/Dec.2011 -- \n"); + printf("SeqX filename : Unknown\n"); + printf("SeqY filename : Unknown\n"); + printf("SeqX name : Unknown\n"); + printf("SeqY name : Unknown\n"); + printf("SeqX length : %"PRIu64"\n", xtotal); + printf("SeqY length : %"PRIu64"\n", ytotal); + printf("Min.fragment.length : 0\n"); + printf("Min.Identity : 0\n"); + printf("Tot Hits (seeds) : 0\n"); + printf("Tot Hits (seeds) used: 0\n"); + printf("Total fragments : 0\n"); + + printf("========================================================\n"); + + + + printf("Type,xStart,yStart,xEnd,yEnd,strand(f/r),block,length,score,ident,similarity,%%ident,SeqX,SeqY\n"); + + double similarity,likeness; + + while(!feof(fFrags)){ + readFragment(&frag, fFrags); + + similarity=100.0*(((double)frag.score)/((double)frag.length*4.0)); + likeness=(((double)frag.ident)/((double)frag.length)); + + if(frag.strand=='r'){ + frag.yStart = ytotal - frag.yStart - 1; + frag.yEnd = ytotal - frag.yEnd - 1; + } + + + if(similarity >= min_sim && (uint64_t)frag.length >= min_l){ + printf("Frag,%"PRIu64",%"PRIu64",%"PRIu64",%"PRIu64",%c,%"PRId64",%"PRIu64",%"PRIu64",%"PRIu64",%.2f,%.2f,%"PRIu64",%"PRIu64"\n",frag.xStart,frag.yStart,frag.xEnd,frag.yEnd,frag.strand,frag.block,frag.length,frag.score,frag.ident,similarity,likeness,frag.seqX,frag.seqY); + } + + } + + fclose(fFrags); + + return 0; +} diff -r 000000000000 -r 9db88f0f32b7 gecko/src/filterHits.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/filterHits.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,84 @@ +/* filterHits + Syntax: filterHits fileIn fileOut dist FilterIsolatedHits + + fileIn Ordered Hits file (Diag/posX/posY) + fileOut Ordered Out file (Diag/posX/posY) + dist Hits that occurs at a less than a distance "dist" from the + previous hit are removed (only the first hit remains) + distance is measure from END-to-Head of 2 hits + FilterIsolatedHits (0=not - 1=Yes) + + ----------------------------------------ortrelles@uma.es /Dic2011---*/ + +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "comparisonFunctions.h" + +int differentSequences(hit h1, hit h2){ + return h1.seqX != h2.seqX || h1.seqY != h2.seqY; +} + +int main(int ac, char** av) { + int wSize = 32; + int64_t diagonal; + uint64_t lastPosition; + uint64_t originalNumberOfHits = 0, finalNumberOfHits = 0, hitsRead = 0; + FILE* fIn, *fOut; + + hit hits[2]; + + if (ac != 4) + terror("USE:filterHits fileIn fileOut wordSize"); + + if ((fIn = fopen(av[1], "rb")) == NULL) + terror("opening HITS input file"); + + if ((fOut = fopen(av[2], "wb")) == NULL) + terror("opening HITS OUT file"); + wSize = atoi(av[3]); + lastPosition = 0; + + hitsRead = fread(&hits[0], sizeof(hit), 1, fIn); + originalNumberOfHits += hitsRead; + if(hitsRead>0) + finalNumberOfHits += fwrite(&hits[0], sizeof(hit), 1, fOut); + diagonal = hits[0].diag; + while (hitsRead > 0) { + hitsRead = fread(&hits[1], sizeof(hit), 1, fIn); + originalNumberOfHits += hitsRead; + + if(differentSequences(hits[0], hits[1])){ + lastPosition = hits[1].posX + (2 * wSize - 1); + diagonal = hits[1].diag; + finalNumberOfHits += fwrite(&hits[1], sizeof(hit), 1, fOut); + memcpy(&hits[0], &hits[1], sizeof(hit)); + continue; + } + + if (hitsRead == 0 && originalNumberOfHits > 1) { + if(diagonal != hits[0].diag || hits[0].posX > (lastPosition)){ + finalNumberOfHits += fwrite(&hits[0], sizeof(hit), 1, fOut); + } + continue; + } + + if (diagonal != hits[1].diag || hits[1].posX > lastPosition) { + lastPosition = hits[1].posX + (2 * wSize - 1); + diagonal = hits[1].diag; + finalNumberOfHits += fwrite(&hits[1], sizeof(hit), 1, fOut); + memcpy(&hits[0], &hits[1], sizeof(hit)); + } + } + + fprintf(stdout, + "\nfilterHits\noriginal number of Hits=%" PRIu64 " Final number of hits=%" PRIu64 "\n", + originalNumberOfHits, finalNumberOfHits); + fclose(fIn); + fclose(fOut); + + return 0; +} diff -r 000000000000 -r 9db88f0f32b7 gecko/src/fragStat.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/fragStat.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,106 @@ +/* DEBUG: list the fragments file + * + * Sintax: ./leeFrags fragsFILE [lowUpThresh.file] + * + * optionally it can include a low/Up filter scheme + * + * O.Trelles >ortrelles @ uma.es> + * -----------------------------------------------May 2012 + */ + +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "comparisonFunctions.h" + +#define SCORE 4 // it depends on the score matrix +#define MAXLENGTH 1000 + +void loadThresholds(int** low, int** up, char* thresholdFile) { + int length, lowSimilarity, upSimilarity; + FILE *fThreshold; + + if ((*low = (int*) calloc(MAXLENGTH, sizeof(int))) == NULL) + terror("enought memory for LowThreshold array"); + + if ((*up = (int*) calloc(MAXLENGTH, sizeof(int))) == NULL) + terror("enought memory for UpperThreshold array"); + + if ((fThreshold = fopen(thresholdFile, "rt")) == NULL) + terror("open Low-Upper threshold file"); + + if(fscanf(fThreshold, "%d\t%d\t%d\n", &length, &lowSimilarity, &upSimilarity)!=3){ + terror("Error reading the threshold file. Wrong format intTABintTABint"); + } + while (!feof(fThreshold)) { + if (length >= MAXLENGTH) + terror("LowUp threshold out of range (maxL)"); + *low[length] = lowSimilarity; + *up[length] = upSimilarity; + if(fscanf(fThreshold, "%d\t%d\t%d\n", &length, &lowSimilarity, &upSimilarity)!=3){ + terror("Error reading the threshold file. Wrong format intTABintTABint"); + } + } + fclose(fThreshold); +} + +int main(int ac, char** av) { + FILE* fFrags; + uint64_t n1, n2, nFrags = 0; + struct FragFile frag; + int *low, *up, length; + int similarity = 0, zone; + int flagT = 0; + + if (ac != 3 && ac != 2) + terror("USE: ./leeFrags fragsFILE [lowUpThresh.file (Optional)]"); + if (ac == 3) + flagT = 1; + + // Load Thresholds--------------------------------------- + if (flagT == 1) { + loadThresholds(&low, &up, av[2]); + } + + // prepared for multiple files + if ((fFrags = fopen(av[1], "rb")) == NULL) + terror("Opening Frags binary file"); + + readSequenceLength(&n1, fFrags); + readSequenceLength(&n2, fFrags); + fprintf(stderr, "working with fragsFile=%s SeqX=%" PRIu64 " seqY=%" PRIu64 "\n", av[1], n1, + n2); + + readFragment(&frag, fFrags); + while (!feof(fFrags)) { + length = frag.length; + if (length >= MAXLENGTH) + length = MAXLENGTH - 1; + fprintf(stdout, + "d=%" PRId64 "\tx=%" PRIu64 "\ty=%" PRIu64 "\tL=%" PRIu64 "\tsco=%" PRIu64 "\tseqX=%" PRIu64 "\tseqY=%" PRIu64 "\tSIM=%lf\tident=%" PRIu64 "\tstrand=%c\tblock=%" PRIu64 , + frag.diag, frag.xStart, frag.yStart, frag.length, frag.score, frag.seqX, frag.seqY, + frag.similarity, frag.ident, frag.strand,frag.block); + if (flagT) { + if (similarity > up[length]) + zone = 2; + else if (similarity > low[length]) + zone = 1; + else + zone = 0; + fprintf(stdout, "\tLow=%d\tUp=%d\tZone=%d", low[length], up[length], + zone); + + } + fprintf(stdout, "\n"); + nFrags++; + readFragment(&frag, fFrags); + } + fprintf(stdout, "nFrags:%" PRIu64 "\n", nFrags); + + fclose(fFrags); + + return 0; +} diff -r 000000000000 -r 9db88f0f32b7 gecko/src/fragmentv2.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/fragmentv2.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,124 @@ +/* + * fragmentv2.c + * + * Created on: 17/07/2013 + * Author: jarjonamedina + */ + +#include +#include +#include +#include +#include +#include + + +#include +#include "structs.h" +#include "comparisonFunctions.h" + +#include "fragmentv2.h" + +//#include "error.h" + + +int writeFragments (struct FragFile* f,char* s,int nf, uint64_t xtotal, uint64_t ytotal){ +/* + FILE* fs; + int n; + + if((fs=fopen(s,"wb"))==NULL){ + printf("***ERROR Opening output file"); + exit(-1); + } + +// +// fwrite(&xtotal,sizeof(int),1,fs); +// fwrite(&ytotal,sizeof(int),1,fs); + + fwrite(&xtotal,sizeof(uint64_t),1,fs); + fwrite(&ytotal,sizeof(uint64_t),1,fs); + + for(n=0;n +#include "structs.h" + + +struct FragFile* readFragmentsv2(char* s,int* nf,uint64_t *xtotal,uint64_t *ytotal); + +int writeFragments(struct FragFile* f,char* s,int nf,uint64_t xtotal, uint64_t ytotal); + + +#endif /* FRAGMENTV2_H_ */ diff -r 000000000000 -r 9db88f0f32b7 gecko/src/fragmentv3.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/fragmentv3.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,98 @@ +/* + * Fragmentv3.c + * + * Created on: 23/02/14 + * Author: jarjonamedina + */ + +#include +#include +#include +#include +#include +#include + +#include "fragmentv3.h" +//#include "error.h" + + +int writeFragmentsv3 (Fragmentv3* f,char* s,int nf, int xtotal, int ytotal){ + + FILE* fs; + int n; + + if((fs=fopen(s,"wb"))==NULL){ + printf("***ERROR Opening output file"); + exit(-1); + } + +// + fwrite(&xtotal,sizeof(int),1,fs); + fwrite(&ytotal,sizeof(int),1,fs); + + for(n=0;n +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "comparisonFunctions.h" + +#define TAB_INSERT 70 + +struct rIndex2 * loadReadsIndex(char * filename, uint64_t * nReads){ + struct rIndex2 * RR; + uint64_t nR=0,i; + FILE *f; + uint64_t fsize; + + if ((f=fopen(filename,"rb"))==NULL) terror("Could not open index input file"); + + fseeko(f, 0, SEEK_END); + fsize = ftello(f); + rewind(f); + nR = fsize/sizeof(struct rIndex2); + + if ((RR =(struct rIndex2*) calloc(nR,sizeof(struct rIndex2)))==NULL) terror("Could not allocate index"); + + for (i=0; ilength, f->ident, f->strand, f->xStart, f->yStart); +} + +void get_both_seqs(char * fastaX, char * fastaY, uint64_t iniX, uint64_t finX, uint64_t iniY, uint64_t finY, uint64_t posX, uint64_t posY, uint64_t LacX, uint64_t LacY, uint64_t lX, uint64_t lY, FILE * fO){ + char copyX[TAB_INSERT+1]; + char copyY[TAB_INSERT+1]; + memset(copyX, 0x0, TAB_INSERT+1); + memset(copyY, 0x0, TAB_INSERT+1); + uint64_t atcopyX = 0; + uint64_t atcopyY = 0; + uint64_t i; + + uint64_t pos_used_x, pos_used_y; + + // The X one + //fseek(fastaX, posX, SEEK_SET); + pos_used_x = posX; + char cX = fastaX[pos_used_x++]; + while(cX != '\n'){ cX = fastaX[pos_used_x++]; } + uint64_t gpX = iniX - LacX; + uint64_t currX = 0, tab_counter; + while(currX < gpX){ + cX = fastaX[pos_used_x++]; + if(cX != '\n') ++currX; + } + + // the other Y + //fseek(fastaY, posY, SEEK_SET); + pos_used_y = posY; + char cY = fastaY[pos_used_y++]; + while(cY != '\n'){ cY = fastaY[pos_used_y++]; } + uint64_t gpY = iniY - LacY; + uint64_t currY = 0; + while(currY < gpY){ + cY = fastaY[pos_used_y++]; + if(cY != '\n') ++currY; + } + + + + + // Reached the region to show + currX = 0; + currY = 0; + cX = fastaX[pos_used_x++]; + cY = fastaY[pos_used_y++]; + //fprintf(fO, "\t"); + tab_counter = 0; + while(currX < lX && currY < lY){ + + if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; } + cX = fastaX[pos_used_x++]; + while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++]; + + + if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; } + cY = fastaY[pos_used_y++]; + while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++]; + + while(currX > currY){ + if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; } + cY = fastaY[pos_used_y++]; + while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++]; + } + while(currX < currY){ + if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; } + cX = fastaX[pos_used_x++]; + while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++]; + } + + ++tab_counter; + + + + if(tab_counter >= TAB_INSERT && currX == currY){ + + copyX[TAB_INSERT] = '\0'; + copyY[TAB_INSERT] = '\0'; + fprintf(fO, "X:\t%.*s\n\t", TAB_INSERT, copyX); + for(i=0; i 0){ + copyX[atcopyX] = '\0'; + copyY[atcopyY] = '\0'; + fprintf(fO, "X:\t%.*s\n\t", (int)atcopyX, copyX); + for(i=0; i + //RI[id].Lac is the sum of the reads length prior + + if(frag.strand == 'f'){ + write_headers(fX, fY, fO, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, &frag); + }else{ + write_headers(fX, fYrev, fO, RI_X[frag.seqX].pos, RI_Yrev[nReads_Y - frag.seqY - 1].pos, &frag); + } + + + //get_seq_from_to(fX, fO, frag.xStart, frag.xEnd, RI_X[frag.seqX].pos, RI_X[frag.seqX].Lac, frag.seqX, frag.length, fO); + + + + if(frag.strand == 'f'){ + get_both_seqs(strfastaX, strfastaY, frag.xStart, frag.xEnd, frag.yStart, frag.yEnd, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, RI_X[frag.seqX].Lac, RI_Y[frag.seqY].Lac, frag.length, frag.length, fO); + //get_seq_from_to(fY, fO, frag.yStart, frag.yEnd, RI_Y[frag.seqY].pos, RI_Y[frag.seqY].Lac, frag.seqY, frag.length, fO); + }else{ + uint64_t seqYnew; + seqYnew = nReads_Y - frag.seqY - 1; + get_both_seqs(strfastaX, strfastaYrev, frag.xStart, frag.xEnd, frag.yStart, frag.yEnd, RI_X[frag.seqX].pos, RI_Yrev[seqYnew].pos, RI_X[frag.seqX].Lac, RI_Yrev[seqYnew].Lac, frag.length, frag.length, fO); + //get_seq_from_to_rev(fYrev, fO, frag.yStart, frag.yEnd, RI_Yrev[seqYnew].pos, RI_Yrev[seqYnew].Lac, seqYnew, frag.length, fO); + } + + readFragment(&frag, fFrags); + } + + fclose(fFrags); + fclose(fX); + fclose(fY); + fclose(fO); + + free(RI_X); + free(RI_Y); + free(RI_Yrev); + + free(strfastaX); + free(strfastaY); + free(strfastaYrev); + + return 0; +} diff -r 000000000000 -r 9db88f0f32b7 gecko/src/getCSB.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/getCSB.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,556 @@ +/* getCSB.c + + + +*/ + + +#include +#include +#include +#include +#include +#include + +#include "fragmentv3.h" +#include "fragmentv2.h" +#include "lista.h" +#define MAX_LEVELS 900000 + +/* Subprograms */ + +void quickSortX(Fragmentv3 *f, int elements); +void quickSortScore(Fragmentv3 *f, int elements); +void quickSortY(Fragmentv3 *f, int elements, int ytotal); +void reorder( Fragmentv3* f, int nf, int ytotal); + +void resetFrags(Fragmentv3* f,int nf); + +Lista ordenarXY(Lista *lista,char c); +void ordenarLista(Lista *lista); + +int S(Fragmentv3 f); // It returns '1' for 'f' strand and '-1' for 'r' strand; +int C(Fragmentv3 a, Fragmentv3 ci,Fragmentv3 cd, Fragmentv3 s); // Collinearity of ant, current, sig. +int L(Fragmentv3 a, Fragmentv3 c); // Linear of ant and current +int CSB(Lista *lista,Fragmentv3 *frags); // Join all posible CSB. return the number of CSB + +int NewList(Lista* lista,Fragmentv3* f,int nf); +void saveCSB(Lista lista,int xtotal,int ytotal); +void saveFragments(Fragmentv3* f,int nf,int xtotal,int ytotal); + +/********************/ + +/* Globals */ +//int dist; +//int size; +char** AV; + +//int MOSTRAR=1; +//int CONTADOR=0; + +// Constantes extensión de gap +int INI=0; +float EXT=3; + + + + +int main(int ac,char** av){ + + AV=av; + if(ac<4){ + printf("Use: getCSB fragsv3.fil csb.fv3 csb.txt \n"); + exit(-1); + } + /***********************/ + /* General vars */ + int size_of_list; + //int i; + + + /* Read Fragments */ + Fragmentv3* f; + int nf,xtotal,ytotal; + f=readFragmentsv3(av[1],&nf,&xtotal,&ytotal); + + + /* Reset frags */ + resetFrags(f,nf); + + /* Order frags */ + reorder(f,nf,ytotal); + + /* Creates a list */ + Lista lista = NULL; + size_of_list=NewList(&lista,f,nf); +//+++++++++++++++++++++++++++ +/************ START PROGRAM ***************************************************************/ + int n=1; + + + // Get CSBs + if(size_of_list>1){ + n=CSB(&lista,f); + } + + + saveFragments(f,nf,xtotal,ytotal); + +return 0; +} + + + + + + + + +/*************************/ +int NewList(Lista* lista,Fragmentv3* f,int nf){ + int i; + int size_of_list=0; + int m; + m=0; + for(i=0;i=0){ + size_of_list++; + + f[i].block=i+1; + f[i].id=i; + Insertar(lista,m++,f[i],0); + + }else{ + f[i].id=-1; + } + //printf("%d\t%ld\t%ld\t%ld\t%d\n",i,f[i].id,f[i].score,f[i].length,f[i].block); + + } + return size_of_list; + } +/***************************************************/ +/***************************************************/ +void saveFragments(Fragmentv3 *frags,int nf,int xtotal,int ytotal){ + +FILE* fs; + if((fs=fopen(AV[2],"wb"))==NULL){ + printf("***ERROR Opening output file %s\n",AV[2]); + exit(-1); + } +FILE* ftxt; + if((ftxt=fopen(AV[3],"w"))==NULL){ + printf("***ERROR Opening output file %s\n",AV[3]); + exit(-1); + } +//printf("abierto %s\n",av[4]); +Fragmentv3 fragv3; + + uint64_t xtotal64,ytotal64; + xtotal64 = xtotal; + ytotal64 = ytotal; + fwrite(&xtotal,sizeof(int),1,fs); + fwrite(&ytotal,sizeof(int),1,fs); +// fwrite(&xtotal,sizeof(int),1,fs); +// fwrite(&ytotal,sizeof(int),1,fs); + + + + + int i; + for(i=0;isiguiente){ + fragv3 = nodo->f; + fwrite(&fragv3,sizeof(Fragmentv3),1,fs); + fprintf(ftxt,"%ld\t%ld\t%ld\t%ld\t%ld\t%c\t%ld\t%d\n",fragv3.xIni,fragv3.yIni,fragv3.xFin,fragv3.yFin,fragv3.length,fragv3.strand,fragv3.score,fragv3.block); + + nodo= nodo->siguiente; + } + + fclose(fs); + fclose(ftxt); + printf("------------FINJ-------------\n"); + //getchar(); +} +/***************************************************/ +int S(Fragmentv3 f){ + + if(f.strand=='f')return 1; + if(f.strand=='r')return -1; + return 0; +} +/***************************************************/ +int C(Fragmentv3 a, Fragmentv3 ci, Fragmentv3 cd, Fragmentv3 s){ + + if( L(a,ci) && L(cd,s) )return 1; + return 0; +} +/***************************************************/ +int L(Fragmentv3 a, Fragmentv3 c){ + + int A,C; + A=a.seqY; + C=c.seqY; + if( ( (A-S(a)) == (C-2*S(c)) ) && (S(c) == S(a)) )return 1; + return 0; +} +/***************************************************/ +int CSB(Lista *lista,Fragmentv3 *frags){ + int n=1; + + + int penal=0; + int joined=1; + + pNodo nodo; + while(joined){ + joined = 0; + nodo = *lista; + while(!nodo->anterior)nodo=nodo->siguiente; + while(nodo->siguiente) { + + + + if( L(nodo->anterior->f,nodo->f) ){ + + penal = INI + EXT*(abs( nodo->anterior->f.xFin- nodo->f.xIni)+abs( nodo->anterior->f.yFin- nodo->f.yIni)); + //printf("penal:%d\n",penal); + + nodo->anterior->f.xFin = nodo->f.xFin; + nodo->anterior->f.yFin = nodo->f.yFin; + //nodo->anterior->f.score = nodo->anterior->f.score + nodo->f.score; + nodo->anterior->f.score = nodo->anterior->f.score + nodo->f.score- penal; + nodo->anterior->f.length = abs(nodo->anterior->f.yFin - nodo->anterior->f.yIni); + + nodo->anterior->f.seqY = nodo->f.seqY; + nodo->anterior->f.seqX = nodo->f.seqX; + //nodo->anterior->f.block=n; + //frags[nodo->f.id].block=n; + //frags[nodo->anterior->f.id].block=n; + frags[nodo->f.id].block=frags[nodo->anterior->f.id].block; + + + + joined = 1; + Borrar(lista, nodo->valor); + + }else{ + + + nodo = nodo->siguiente; + n++; + + } + } + // Para el ultimo + if( L(nodo->anterior->f,nodo->f) ){ + penal = INI + EXT*(abs( nodo->anterior->f.xFin- nodo->f.xIni)+abs( nodo->anterior->f.yFin- nodo->f.yIni)); + //printf("penal:%d\n",penal); + + nodo->anterior->f.xFin = nodo->f.xFin; + nodo->anterior->f.yFin = nodo->f.yFin; + //nodo->anterior->f.score = nodo->anterior->f.score + nodo->f.score; + nodo->anterior->f.score = nodo->anterior->f.score + nodo->f.score- penal; + nodo->anterior->f.length = abs(nodo->anterior->f.yFin - nodo->anterior->f.yIni); + + nodo->anterior->f.seqY = nodo->f.seqY; + nodo->anterior->f.seqX = nodo->f.seqX; + + frags[nodo->f.id].block=frags[nodo->anterior->f.id].block; + + joined = 1; + Borrar(lista, nodo->valor); + } + if(n>=1)ordenarLista(lista); + } + return n; +} +/***************************************************/ +Lista ordenarXY(Lista *lista,char c){ + + Lista nueva=NULL; + pNodo nodo= *lista; + nodo = *lista; +/*******************************************/ +/* Primero buscamos el valor menor, para que sea el primero en insertarse */ + + int valor=0; + pNodo primero=nodo; + + //while(nodo->anterior)nodo=nodo->anterior; + if(c=='x'){valor=nodo->f.seqX;primero=nodo;} + if(c=='y'){valor=nodo->f.seqY;primero=nodo;} + + + while(nodo){ + if(c=='x'){ + if(valor>=nodo->f.seqX && nodo->f.block>=0){ + valor=nodo->f.seqX; + primero=nodo; + } + + }else{ + if(valor>=nodo->f.seqY && nodo->f.block>=0){ + valor=nodo->f.seqY; + primero=nodo; + } + + } + nodo=nodo->siguiente; + } +// if(c=='x'){printf("aqui primero->f.seqX: %d valor:%d\n",primero->f.seqX,valor);} +// if(c=='y'){printf("aqui primero->f.seqY: %d valor:%d\n",primero->f.seqY,valor);} + + + //Insertamos el primero + if(c=='x'){ +// printf("X insertamos en la posicion %d\n",primero->f.seqX); + Insertar(&nueva,primero->f.seqX,primero->f,primero->n_event); + Borrar(lista,primero->valor); + }else{ +/// printf("Y insertamos en la posicion %d\n",primero->f.seqY); + Insertar(&nueva,primero->f.seqY,primero->f,primero->n_event); +// printf("ya hemos insertado\n"); + Borrar(lista,primero->valor); + } + +/*******************************************/ + + nodo = *lista; + while(nodo->anterior)nodo=nodo->anterior; + while(nodo){ + // Insertarmos en el orden de XY + if(c=='x'){ + Insertar(&nueva,nodo->f.seqX,nodo->f,nodo->n_event); + }else{ + Insertar(&nueva,nodo->f.seqY,nodo->f,nodo->n_event); + + } + nodo = nodo->siguiente; + } + + + pNodo newNodo = nueva; + newNodo = nueva; + int i=0; + while(newNodo){ + if(c=='x'){ + + newNodo->f.seqX=i++; + }else{ + newNodo->f.seqY=i++; + } + + newNodo = newNodo->siguiente; + } + + return nueva; + + +} +/****************************************/ +void reorder( Fragmentv3* f, int nf, int ytotal){ + int i; + int j; + + /* Sort Y*/ + quickSortY(&f[0],nf,ytotal); + j=0; + for(i=0;i=0)f[i].seqY=j++; + } + + + /* Sort X*/ + quickSortX(&f[0],nf); + j=0; + for(i=0;i=0)f[i].seqX=j++; + } + + +} +/******/ +void resetFrags(Fragmentv3* f, int nf){ +int i; +for(i=0;i=0) { + L=beg[i]; R=end[i]-1; + if (L=piv.score && Lend[i-1]-beg[i-1]) { + swap=beg[i]; beg[i]=beg[i-1]; beg[i-1]=swap; + swap=end[i]; end[i]=end[i-1]; end[i-1]=swap; }} + else { + i--; }}} +/*****************/ +void quickSortX(Fragmentv3 *f, int elements) { + + + + + int beg[MAX_LEVELS], end[MAX_LEVELS], i=0, L, R, swap ; + Fragmentv3 piv; + beg[0]=0; end[0]=elements; + while (i>=0) { + L=beg[i]; R=end[i]-1; + if (L=piv.xIni && Lend[i-1]-beg[i-1]) { + swap=beg[i]; beg[i]=beg[i-1]; beg[i-1]=swap; + swap=end[i]; end[i]=end[i-1]; end[i-1]=swap; }} + else { + i--; }}} +/*****************/ +void quickSortY(Fragmentv3 *f, int elements,int ytotal) { + + + + int beg[MAX_LEVELS], end[MAX_LEVELS], i=0, L, R, swap ; + Fragmentv3 piv; + +/* +// Change Y component +long ytemp; + for(i=0;i=0) { + L=beg[i]; R=end[i]-1; + if (L=piv.yIni && Lend[i-1]-beg[i-1]) { + swap=beg[i]; beg[i]=beg[i-1]; beg[i-1]=swap; + swap=end[i]; end[i]=end[i-1]; end[i-1]=swap; }} + else { + i--; }} + +// Change Y component +/* + for(i=0;i +#include +#include +#include +#include +#include + +#include "fragmentv3.h" +#include "fragmentv2.h" + +#include "lista.h" +#define MAX_LEVELS 900000 + + +int main(int ac,char** av){ + + + if(ac<2){ + printf("Use: getInfo file.frags\n"); + exit(-1); + } + + // Read fragments + struct FragFile* f; + int nf; // Number of fragments + uint64_t xtotal,ytotal; + nf=0; + f=readFragmentsv2(av[1],&nf,&xtotal,&ytotal); + + /******************************************/ + int i; + + // Print header. Frags file info + printf("Total CSB: 0\n"); + printf("========================================================\n"); + + printf("Type,xStart,yStart,xEnd,yEnd,strand(f/r),block,length,score,ident,similarity,%%ident,SeqX,SeqY\n"); + + double likeness; + + for(i=0;i +#include +#include +#include +#include +#include + +#include "structs.h" +#include "commonFunctions.h" +#include "dictionaryFunctions.h" + +#define PEQ 1001 + +int main(int ac, char** av){ + + char fname[1024], *W; + W=(char *)malloc(33*sizeof(char)); + FILE *f1, *f2, *f3; + hashentry he; + uint64_t i=0; + location spos; + uint64_t nW=0,maxF=0, aveF=0; + int flagV=0; + int64_t freq[PEQ]; + + if(ac<2)terror("USE: leehd prefixNameOUT [v=verbose]\n"); + if (ac==3) flagV=1; + for (i=0;i=PEQ) { + fprintf(f3, "%" PRIu64 "\t", he.num); + showWord(&he.w, W); + fprintf(f3, "%.32s", W); + fprintf(f3, "%" PRIu64 "\n", he.num); + } + else freq[he.num]++; + nW++; + if (he.num>maxF) maxF=he.num; + aveF+=he.num; + + fseek(f2,0, he.pos); + if (flagV) { + + for (i=0;i FreqThr are skipped) + + Feb.6 : new parameter (same meaning as in w2hd): PrefixSize + + May 22 : for long repetitions some of them will be skipped (the step is + computed as NREP / MaxREP + + ortrelles@uma.es / Dic.2011 + ---------------------------------------------------------*/ + +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "dictionaryFunctions.h" +#include "comparisonFunctions.h" + +#define MAXBUF 10000000 +#define MaxREP 10000 + +int main(int ac, char** av) { + + char fname[1024]; + int hitsInBuf = 0; + int i, j; + int comp; + int firstMatch = 0, endMatch = 0; + uint64_t freqThr; + uint64_t nHits = 0, wordMatches = 0; + int wSize; + int stepX, stepY; + long offsetWordBefore, offsetWordAfter, offsetLocationBefore, offsetLocationAfter; + FILE *fX1, *fX2, *fY1, *fY2, *fOut; + + location *posX = NULL, *posY = NULL; + hashentry heX, heY; + hit *hBuf; + + if (ac != 6) + terror("USE: hits prefixNameX prefixNameY Outfile FreqThre PrefixSize"); + freqThr = (uint64_t) atoi(av[4]); + wSize = atoi(av[5]); + + // I-O buffer + if ((hBuf = (hit*) calloc(sizeof(hit), MAXBUF)) == NULL) + terror("HITS: memory for I-O buffer"); + // word positions buffers + if ((posX = (location*) calloc(MAXBUF, sizeof(location))) == NULL) + terror("memory for posX buffer.. using MAXBUF=10MB"); + if ((posY = (location*) calloc(MAXBUF, sizeof(location))) == NULL) + terror("memory for posY buffer.. using MAXBUF=10MB"); + + // Sequence X files + sprintf(fname, "%s.d2hW", av[1]); + if ((fX1 = fopen(fname, "rb")) == NULL) + terror("opening seqX.d2hW file"); + sprintf(fname, "%s.d2hP", av[1]); + if ((fX2 = fopen(fname, "rb")) == NULL) + terror("opening seqX.d2hP file"); + + // Sequence Y files + sprintf(fname, "%s.d2hW", av[2]); + if ((fY1 = fopen(fname, "rb")) == NULL) + terror("opening seqY.d2hW file"); + sprintf(fname, "%s.d2hP", av[2]); + if ((fY2 = fopen(fname, "rb")) == NULL) + terror("opening seqY.d2hP file"); + + // OUT file + if ((fOut = fopen(av[3], "wb")) == NULL) + terror("opening OUT file"); + + // kick-off + if (readHashEntry(&heX, fX1, freqThr) == -1) + terror("no hash (1)"); + if (readHashEntry(&heY, fY1, freqThr) == -1) + terror("no hash (2)"); + + while (!feof(fX1) && !feof(fY1)) { + + comp = wordcmp(&heX.w.b[0], &heY.w.b[0], wSize); + if (comp < 0) { + readHashEntry(&heX, fX1, freqThr); + //Save position of first missmatch after matches and rewind + if(firstMatch){ + offsetWordAfter = ftell(fY1) - sizeof(hashentry); + offsetLocationAfter = ftell(fY2); + fseek(fY1, offsetWordBefore, SEEK_SET); + readHashEntry(&heY, fY1, freqThr); + fseek(fY2, offsetLocationBefore, SEEK_SET); + firstMatch = 0; + endMatch = 1; + } + continue; + } + if (comp > 0) { + //No more matches, go to the next word + if(endMatch){ + fseek(fY1, offsetWordAfter, SEEK_SET); + fseek(fY2, offsetLocationAfter, SEEK_SET); + endMatch = 0; + } + readHashEntry(&heY, fY1, freqThr); + continue; + } + + wordMatches++; + + // Load word position for seqX + if(!firstMatch)loadWordOcurrences(heX, &posX, &fX2); + + // Saving the offset of the first match + if(wSize < 32 && !firstMatch){ + offsetWordBefore = ftell(fY1) - sizeof(hashentry); + offsetLocationBefore = ftell(fY2); + firstMatch = 1; + } + + // Load word position for seqY + loadWordOcurrences(heY, &posY, &fY2); + + // Hits----------------------- + if (heX.num > MaxREP) + stepX = heX.num / MaxREP; + else + stepX = 1; + if (heY.num > MaxREP) + stepY = heY.num / MaxREP; + else + stepY = 1; + + for (i = 0; i < heX.num; i += stepX) + for (j = 0; j < heY.num; j += stepY) { + hBuf[hitsInBuf].diag = posX[i].pos - posY[j].pos; + hBuf[hitsInBuf].posX = posX[i].pos; + hBuf[hitsInBuf].seqX = posX[i].seq; + hBuf[hitsInBuf].posY = posY[j].pos; + hBuf[hitsInBuf].seqY = posY[j].seq; + + hitsInBuf++; + if (hitsInBuf == MAXBUF - 1) { + fwrite(hBuf, sizeof(hit), hitsInBuf, fOut); + hitsInBuf = 0; + } + } + + nHits += ((heX.num / stepX) * (heY.num / stepY)); + + if(!firstMatch)readHashEntry(&heX, fX1, freqThr); + readHashEntry(&heY, fY1, freqThr); + + } + + //Closing dictionary files + fclose(fX1); + fclose(fY1); + fclose(fX2); + fclose(fY2); + + //Checking if there is something still at the buffer + if (hitsInBuf != 0) { + fwrite(hBuf, sizeof(hit), hitsInBuf, fOut); + } + fclose(fOut); + + fprintf(stdout, "HITS: matches=%" PRIu64 " Tot HITS=%" PRIu64 "\n", wordMatches, nHits); + + return 0; +} + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/hitsStat.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/hitsStat.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,47 @@ +/* lee Hits file (diag/posX/posY) + + Syntax: leeHits InputHITSFile + --------------------------------------------------------*/ +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" + +int main(int ac, char**av) +{ + FILE *f1; + hit H; + unsigned long nHits; + //--------------------------------- + + if (ac!=2) + terror("Use: LeeHits InputFile]"); + + if ((f1=fopen(av[1],"rb"))==NULL) + terror("Input file open error"); + + fseek(f1,0, SEEK_END); + nHits = ftell(f1)/sizeof(hit); + fprintf(stdout,"File size=%ld nHits=%ld\n", nHits * sizeof(hit), nHits); + fprintf(stdout,"---------------[RET]\n");fgetc(stdin); + fseek(f1,0, SEEK_SET); + + if(fread(&H,sizeof(hit),1,f1)!=1){ + terror("Empty hits file"); + } + while(!feof(f1)){ + fprintf(stdout,"d=%-7" PRId64 " pX=%-7" PRIu64 " pY=%-7" PRIu64 " seqX=%-7" PRIu64 " seqY=%-7" PRIu64 "\n",H.diag,H.posX,H.posY,H.seqX,H.seqY); + if(fread(&H,sizeof(hit),1,f1)!=1){ + if(ferror(f1))terror("Empty hits file"); + } + } + + fclose(f1); + return 0; + +} + + + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/indexmaker.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/indexmaker.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,133 @@ + +/********************* +File indexmaker.c +Author Bitlab +Description Computes a series of statistics and stores them in natural order and sorted order + +USAGE The fasta sequences + The output name for the two indexes + * ------------------------------------------*/ + +#include +#include +#include +#include +#include +#include "structs.h" +#include "comparisonFunctions.h" +#include "commonFunctions.h" + + +void copyRead(struct rIndex2 *To, struct rIndex2 *From); +void QsortRead(struct rIndex2 *array,uint64_t x, uint64_t y); + +int main(int ac, char **av) { + + FILE *f, *fOut; + struct rIndex2 R;//, *RR; + //uint64_t *positions; + uint64_t c; + uint64_t tLen=0, tLmask=0, tN=0, nReads=0; + uint64_t maxRlen=0, maxRlenMasked=0, maxNonACGT=0; + uint64_t N=0, i; + //char tmp[MAXLID]; + + if (ac!=3) terror("USE: indexmaker "); + + + if ((f=fopen(av[1],"rt"))==NULL) terror("Could not open sequences fasta file"); + + if ((fOut=fopen(av[2],"wb"))==NULL) terror("Could not open output file"); + + R.Lac=0; + R.rLen=0; + c=fgetc(f); + while (!feof(f)) { + while(c!='>') c=(char)fgetc(f); // only first time + R.pos=ftell(f) - 1; + i=0; + c=getc(f); + if(i==0 && c== ' ') c=getc(f); //Handle sequences with a ' ' after the '>' + while(i< MAXLID && c!='\n' && c!=' ') { + R.id[i++] = c; + c=getc(f); + } + R.id[i]=0x00; + + while(c!='\n') c=getc(f); + + i=0; + R.rLen=0; + R.rLmasked=0; + R.nonACGT=0; + + while(c!='>' && !feof(f)) { + if (c=='N' || c=='n') R.nonACGT++; + else if (c>96 && c<123) R.rLmasked++; + c=toupper(c); + if (c>64 && c<91) R.rLen++; + if (c>47 && c<58) R.rLen++; //Handling color space reads + + c=(char)fgetc(f); + } + R.rNumber = N; + if (R.rLen > maxRlen) maxRlen=R.rLen; + if (R.rLmasked > maxRlenMasked) maxRlenMasked = R.rLmasked; + if (R.nonACGT > maxNonACGT) maxNonACGT = R.nonACGT; + + fwrite(&R,sizeof(struct rIndex2),1,fOut); + + N++; + + R.Lac +=R.rLen; + tLen +=R.rLen; + tN +=R.nonACGT; + tLmask+=R.rLmasked; + + nReads++; + + } + + fclose(f); + fclose(fOut); + + fprintf(stdout,"....done\n"); + return 0; +} + +void copyRead(struct rIndex2 *To, struct rIndex2 *From){ + + strcpy(To->id, From->id); + To->rNumber = From->rNumber; + To->rLen = From->rLen; + To->rLmasked= From->rLmasked; + To->nonACGT = From->nonACGT; + To->pos = From->pos; //Start position of read + To->Lac = From->Lac; // accumulated length +} + + +void QsortRead(struct rIndex2 *array,uint64_t x, uint64_t y) { + + struct rIndex2 pivote, aux; + uint64_t x1, y1; + + copyRead(&pivote, &array[(x+y)/2]); + x1 = x; + y1 = y; + + do{ + while (strcmp(pivote.id, array[x1].id)>0) x1++; + while (strcmp(pivote.id, array[y1].id)<0) y1--; + if (x1 < y1) { + copyRead(&aux,&array[x1]); + copyRead(&array[x1], &array[y1]); + copyRead(&array[y1], &aux); + x1++; + y1--; + }else if (x1 == y1) x1++; + } while (x1 <=y1); + + if (x +#include +#include +#include +#include +#include + +#include "lista.h" + +void Insertar(Lista *lista, int v,Fragmentv3 f,int n_event) { + pNodo nuevo, actual; + + /* Crear un nodo nuevo */ + nuevo = (pNodo)malloc(sizeof(tipoNodo)); + nuevo->valor = v; + nuevo->f=f; + nuevo->n_event=n_event; + + /* Colocamos actual en la primera posición de la lista */ + actual = *lista; + if(actual) while(actual->anterior) actual = actual->anterior; + + /* Si la lista está vacía o el primer miembro es mayor que el nuevo */ + if(!actual || actual->valor > v) { + /* Añadimos la lista a continuación del nuevo nodo */ + nuevo->siguiente = actual; + nuevo->anterior = NULL; + if(actual) actual->anterior = nuevo; + if(!*lista) *lista = nuevo; + } + else { + /* Avanzamos hasta el último elemento o hasta que el siguiente tenga + un valor mayor que v */ + while(actual->siguiente && actual->siguiente->valor <= v) + actual = actual->siguiente; + /* Insertamos el nuevo nodo después del nodo anterior */ + nuevo->siguiente = actual->siguiente; + actual->siguiente = nuevo; + nuevo->anterior = actual; + if(nuevo->siguiente) nuevo->siguiente->anterior = nuevo; + } +} + +void Borrar(Lista *lista, int v) { + pNodo nodo; + + + /* Buscar el nodo de valor v */ + nodo = *lista; + while(nodo && nodo->valor siguiente; + while(nodo && nodo->valor > v) nodo = nodo->anterior; + + /* El valor v no está en la lista */ + if(!nodo || nodo->valor != v) return; + + /* Borrar el nodo */ + /* Si lista apunta al nodo que queremos borrar, apuntar a otro */ + if(nodo == *lista){ + if(nodo->anterior) {*lista = nodo->anterior;} + else {*lista = nodo->siguiente;} + } + if(nodo->anterior) /* no es el primer elemento */ + nodo->anterior->siguiente = nodo->siguiente; + if(nodo->siguiente) /* no es el último nodo */ + nodo->siguiente->anterior = nodo->anterior; + free(nodo); +} +void BorrarLista(Lista *lista) { + pNodo nodo, actual; + + actual = *lista; + while(actual->anterior) actual = actual->anterior; + + while(actual) { + nodo = actual; + actual = actual->siguiente; + free(nodo); + } + *lista = NULL; +} + +void MostrarLista(Lista lista, int orden,char* nombre,int num) { + pNodo nodo = lista; + int i=0; + if(!lista) printf("Lista vacía"); + + nodo = lista; + if(orden == 1) { + while(nodo->anterior) nodo = nodo->anterior; + // printf("Orden ascendente: "); + while(nodo) { + // printf("%d -> ", nodo->valor); + printFrag(nodo->f,nombre,i++,num,nodo->n_event); + nodo = nodo->siguiente; + } + } + else { + while(nodo->siguiente) nodo = nodo->siguiente; + // printf("Orden descendente: "); + while(nodo) { + printFrag(nodo->f,nombre,i++,num,nodo->n_event); + nodo = nodo->anterior; + } + } + + //printf("\n"); +} + +void printFrag(Fragmentv3 f,char* nombre,int i,int num,int n_event){ + + + fprintf(NULL,"%d\t%s\t%d\t%ld\t%ld\t%ld\t%ld\t%f\t%d\t%d\t%d\t%d\t%d\t%c\t%d\t%d\t%d\t%d\t%d\n",i,nombre,(int)num,f.xIni,f.xFin,f.yIni,f.yFin,f.similarity,(int)f.id,(int)f.seqY,(int)f.seqX,(int)f.block,(int)f.gap,f.strand,(int)f.reversions,(int)f.translocations,(int)f.duplications,(int)f.events,(int)f.score); + printf("%ld\t%ld\t%d\n",f.seqY,f.seqX,n_event); + + +} + + +void CopiarListas(Lista* origen,Lista* dest){ // SUponemos las dos listas mismo tamaño + + pNodo onodo = *origen; + pNodo dnodo = *dest; + while(onodo->anterior && dnodo->anterior) {onodo = onodo->anterior;dnodo = dnodo->anterior;} + + while(onodo && dnodo) { + /* + dnodo->f=*(onodo->f); + dnodo->valor=*(onodo->valor); + */ + + dnodo = dnodo->siguiente; + onodo = onodo->siguiente; + + } + + +} + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/lista.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/lista.h Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,33 @@ +/*lista.h + + +*/ + + +#ifndef LISTA_H_ +#define LISTA_H_ + +#include "fragmentv3.h" + +typedef struct _nodo { + Fragmentv3 f; + int valor; + int n_event; + struct _nodo *siguiente; + struct _nodo *anterior; +} tipoNodo; + +typedef tipoNodo *pNodo; +typedef tipoNodo *Lista; + + +void Insertar(Lista *lista, int v,Fragmentv3 f,int n_event); +void Borrar(Lista *lista, int v); +void printFrag(Fragmentv3 f,char* nombre,int i,int num,int n_event); +void BorrarLista(Lista *lista); +void CopiarListas(Lista* origen,Lista* dest); // SUponemos las dos listas mismo tamaño + + + + +#endif /* LISTA_H_ */ diff -r 000000000000 -r 9db88f0f32b7 gecko/src/matrix.mat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/matrix.mat Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,6 @@ +ACGT +ACGT +4 -4 -4 -4 +-4 4 -4 -4 +-4 -4 4 -4 +-4 -4 -4 4 diff -r 000000000000 -r 9db88f0f32b7 gecko/src/quicksort.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/quicksort.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,392 @@ +#include +#include +#include +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "quicksort.h" + +#define BUF_SIZE 1024 +#define SWAP(a,b,t) t=a; a=b; b=t; +#define STRING_SIZE 1024 + +typedef struct { + BaseType* a; + int l; + int r; + int nth; +} PQSortArgs; + +typedef struct { + char fin1[STRING_SIZE]; + char fin2[STRING_SIZE]; + char fout[STRING_SIZE]; +} PMergeArgs; + +void assertNotNull(void* p, char* msg){ + if(p==NULL){ + terror(msg); + } +} + +void assertIntEQ(int a, int b, char* msg){ + if(a!=b){ + terror(msg); + } +} + +void assertIntGE(int a, int b, char* msg){ + if(afin1,"rb"); + assertNotNull(fin1,args->fin1); + + FILE* fin2 = fopen(args->fin2,"rb"); + assertNotNull(fin2,args->fin2); + + FILE* fout = fopen(args->fout,"wb"); + assertNotNull(fout,args->fout); + + BaseType *a1 = (BaseType*)calloc(BUF_SIZE,sizeof(BaseType)); + assertNotNull(a1,"calloc"); + + BaseType *a2 = (BaseType*)calloc(BUF_SIZE,sizeof(BaseType)); + assertNotNull(a2,"calloc"); + + BaseType *m = (BaseType*)calloc(2*BUF_SIZE,sizeof(BaseType)); + assertNotNull(m,"calloc"); + + int i,j,k; + int r1,r2,tmp; + + r1=fread(a1,sizeof(BaseType),BUF_SIZE,fin1); + r2=fread(a2,sizeof(BaseType),BUF_SIZE,fin2); + + i=j=k=0; + while(r1>0 && r2>0){ + while(i=2*BUF_SIZE){ + tmp=fwrite(m,sizeof(BaseType),k,fout); + assertIntEQ(tmp,k,"fwrite"); + k=0; + } + } + + if(i>=r1){ + r1=fread(a1,sizeof(BaseType),BUF_SIZE,fin1); + i=0; + } + + if(j>=r2){ + r2=fread(a2,sizeof(BaseType),BUF_SIZE,fin2); + j=0; + } + } + + if(k>0){ + tmp=fwrite(m,sizeof(BaseType),k,fout); + assertIntEQ(tmp,k,"fwrite"); + } + + if(ifin1),0,args->fin1); + assertIntEQ(unlink(args->fin2),0,args->fin2); + + pthread_exit(NULL); +} + +int partition(BaseType* a, int l, int r) { + int i=l; + int j=r+1; + BaseType t; + + // l sera el pivote + // y contendra la mediana de l, r y (l+r)/2 + int mid = (l+r)/2; + + if(GT(a[mid],a[r])) { + SWAP(a[mid],a[r],t); + } + + if(GT(a[mid],a[l])) { + SWAP(a[mid],a[l],t); + } + + if(GT(a[l],a[r])) { + SWAP(a[l],a[r],t); + } + + while (1) { + do{ + ++i; + }while( !GT(a[i],a[l]) && i <= r ); + + do{ + --j; + }while( GT(a[j],a[l]) && j >= l); + + if( i >= j ) break; + + SWAP(a[i],a[j],t) + } + + SWAP(a[l],a[j],t) + + return j; +} + +int QsortC(BaseType* a, int l,int r) { + int j; + + if( l < r ) { + // divide and conquer + j = partition( a, l, r); + // j=(l+r)/2; + QsortC( a, l, j-1); + QsortC( a, j+1, r); + } + return 0; +} + +void *PQSort(void* a){ + PQSortArgs *args=(PQSortArgs*)a; + if(args->nth>1){ + int tmp; + int j = partition(args->a,args->l,args->r); + int np=1; + if(args->r - args->l > 0) + np = (args->nth*(j-args->l))/(args->r-args->l); + if(np<1) np=1; + if(np>=args->nth) np=args->nth-1; + //printf("%d\t%d (%d)\t%d (%d)\n",args->r-args->l,j-args->l,np,args->r-j,args->nth-np); + + pthread_t* th = (pthread_t*)calloc(2,sizeof(pthread_t)); + assertNotNull(th,"calloc"); + + PQSortArgs* nargs = (PQSortArgs*)calloc(2,sizeof(PQSortArgs)); + assertNotNull(args,"calloc"); + + nargs[0].a=args->a; + nargs[0].l=args->l; + nargs[0].r=j-1; + nargs[0].nth=np; + tmp=pthread_create(th,NULL,PQSort,(void*)(nargs)); + assertIntEQ(tmp,0,"pthread_create"); + + nargs[1].a=args->a; + nargs[1].l=j+1; + nargs[1].r=args->r; + nargs[1].nth=args->nth-np; + tmp=pthread_create(th+1,NULL,PQSort,(void*)(nargs+1)); + assertIntEQ(tmp,0,"pthread_create"); + + tmp=pthread_join(th[0],NULL); + assertIntEQ(tmp,0,"pthread_join"); + tmp=pthread_join(th[1],NULL); + assertIntEQ(tmp,0,"pthread_join"); + + free(th); + free(nargs); + }else{ + QsortC(args->a,args->l,args->r); + } + pthread_exit(NULL); +} + +unsigned long timestart(){ + struct timeval tv; + + gettimeofday(&tv,NULL); + + return (tv.tv_usec/1000) + (tv.tv_sec*1000); +} + +unsigned long timestop(unsigned long start){ + struct timeval tv; + + gettimeofday(&tv,NULL); + + return (tv.tv_usec/1000) + (tv.tv_sec*1000) - start; +} + +int psort(int maxsize, int nproc, char* ifile, char* ofile){ + int tmp; + int max=maxsize; + int np=nproc; + if(np<1) np=1; + + printf("Allocating %lu bytes.\n",max*sizeof(BaseType)); + BaseType* a = (BaseType*)calloc(max,sizeof(BaseType)); + assertNotNull(a,"calloc"); + + FILE* fin=fopen(ifile,"rt"); + assertNotNull(fin,ifile); + + printf("Stage1: Quicksorts\n"); + unsigned long t = timestart(); + //Read + Quicksort + Write: + char fname[strlen(ofile)+10]; + int nfile=0; + + do { + //Read: + // printf("Reading...\n"); + int n=fread(a,sizeof(BaseType),max,fin); + if(n==0) break; + + //Quicksort: + // printf("Quicksort %d\n",n); + pthread_t th; + PQSortArgs args; + + args.a=a; + args.l=0; + args.r=n-1; + args.nth=np; + tmp=pthread_create(&th,NULL,PQSort,(void*)(&args)); + assertIntEQ(tmp,0,"pthread_create"); + + //Wait: + tmp=pthread_join(th,NULL); + assertIntEQ(tmp,0,"pthread_join"); + + //Write: + // printf("Writing...\n"); + sprintf(fname,"%s.%06d",ofile,nfile); + FILE* fout=fopen(fname,"wb"); + assertNotNull(fout,"fname"); + + tmp=fwrite(a,sizeof(BaseType),n,fout); + assertIntEQ(tmp,n,"fwrite"); + + tmp=fclose(fout); + assertIntEQ(tmp,0,"fclose"); + + nfile++; + }while(!feof(fin)); + + tmp=fclose(fin); + assertIntEQ(tmp,0,"fclose"); + + free(a); + + tmp=unlink(ifile); + assertIntEQ(tmp,0,"unlink"); + + printf("Stage1: %lu\n\n",timestop(t)); + + //Merge: + printf("Stage2: Merge\n"); + t = timestart(); + + int curf=0; + int i=0; + int j=0; + pthread_t th[np]; + PMergeArgs args[np]; + + curf=0; + while(nfile-curf>1){ + j=0; + while(curf %d\n",curf,curf+1,nfile+j); + tmp=pthread_create(th+j,NULL,PMerge,(void*)(args+j)); + assertIntEQ(tmp,0,"pthread_create"); + curf+=2; + j++; + } + + for(i=0;i0){ + tmp=rename(fname,ofile); + assertIntEQ(tmp,0,"rename"); + } else { + FILE *fout=fopen(ofile, "wb"); + assertNotNull(fout,"fopen"); + fclose(fout); + } + + printf("Stage3: %lu\n",timestop(t)); + + return 0; + +} diff -r 000000000000 -r 9db88f0f32b7 gecko/src/quicksort.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/quicksort.h Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,18 @@ +#ifndef QUICKSORT_H +#define QUICKSORT_H + +/** + * Function to order the ifile using a buffer size of + * 'maxsize' and a thread number of 'nproc'. the result + * will be written to ofile + */ +int psort(int maxsize, int nproc, char* ifile, char* ofile); + +/** + * Function to determine if object 1 is strictly greater than 2. + * Returns 1 if true, 0 otherwise + * Has to be defined in the main procedure + */ +int GT(BaseType a1, BaseType a2); + +#endif /* QUICKSORT_H */ diff -r 000000000000 -r 9db88f0f32b7 gecko/src/reverseComplement.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/reverseComplement.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,120 @@ +/* reverseComplement.c + Program reading a FASTA file as input (one or multiple sequence) + and then returning the reverse complementary sequence in the output file. + + It is important to note that for multiple sequence file the first input + sequence is the last one at the output file and the last one in the input + is the first one in the output. + oscart@uma.es + */ + +#include +#include +#include +#include +#include +#include "commonFunctions.h" + +#define SEQSIZE 2000000000 +#define WSIZE 32 +#define NREADS 1000000 + +int main(int ac, char** av) { + FILE *fIn, *fOut; + int64_t i, j, nR, seqLen = 0; + char *seq, c, toW; + long offset[NREADS]; + + if (ac != 3) + terror("USE: reverseComplement seqFile.IN reverseComplementarySeq.OUT"); + + /** + * Allocating memory for the sequence + * Fixed amount of memory, change it to loadSeqDB? + */ + if ((seq = (char*) malloc(SEQSIZE)) == NULL) + terror("memory for Seq"); + + if ((fIn = fopen(av[1], "rt")) == NULL) + terror("opening IN sequence FASTA file"); + + if ((fOut = fopen(av[2], "wt")) == NULL) + terror("opening OUT sequence Words file"); + + for(i=0;i'){ + offset[nR++] = ftell(fIn)-1; + } + c = fgetc(fIn); + } + + for(i=nR-1; i>=0; i--){ + fseek(fIn, offset[i], SEEK_SET); + //READ and write header + if(fgets(seq, SEQSIZE, fIn)==NULL){ + terror("Empty file"); + } + fprintf(fOut, "%s", seq); + //READ and write sequence + c = fgetc(fIn); + while(c != '>' && !feof(fIn)){ + if(isupper(c) || islower(c)){ + seq[seqLen++]=c; + } + c = fgetc(fIn); + } + for(j=seqLen-1; j >= 0; j--){ + switch(seq[j]){ + case 'A': + toW = 'T'; + break; + case 'C': + toW = 'G'; + break; + case 'G': + toW = 'C'; + break; + case 'T': + toW = 'A'; + break; + case 'U': + toW = 'A'; + break; + case 'a': + toW = 't'; + break; + case 'c': + toW = 'g'; + break; + case 'g': + toW = 'c'; + break; + case 't': + toW = 'a'; + break; + case 'u': + toW = 'a'; + break; + default: + toW = seq[j]; + break; + } + fwrite(&toW, sizeof(char), 1, fOut); + } + toW='\n'; + fwrite(&toW, sizeof(char), 1, fOut); + seqLen=0; + } + + fclose(fIn); + fclose(fOut); + + return 0; +} + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/sortHits.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/sortHits.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,26 @@ +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "quicksort.h" + +int GT(BaseType a1, BaseType a2) { + if(a1.diag > a2.diag) + return 1; + else if (a1.diag < a2.diag) + return 0; + if (a1.posX > a2.posX) + return 1; + + return 0; +} + +int main(int ac, char** av) { + if (ac < 4) { + printf("USE: sortHits \n"); + exit(1); + } + + return psort(atoi(av[1]), atoi(av[2]), av[3], av[4]); +} + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/sortWords.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/sortWords.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,46 @@ +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "quicksort.h" + +int GT(BaseType a1, BaseType a2){ + if(a1.w.b[0] > a2.w.b[0]) return 1; + else if(a1.w.b[0] < a2.w.b[0]) return 0; + + if(a1.w.b[1] > a2.w.b[1]) return 1; + else if(a1.w.b[1] < a2.w.b[1]) return 0; + + if(a1.w.b[2] > a2.w.b[2]) return 1; + else if(a1.w.b[2] < a2.w.b[2]) return 0; + + if(a1.w.b[3] > a2.w.b[3]) return 1; + else if(a1.w.b[3] < a2.w.b[3]) return 0; + + if(a1.w.b[4] > a2.w.b[4]) return 1; + else if(a1.w.b[4] < a2.w.b[4]) return 0; + + if(a1.w.b[5] > a2.w.b[5]) return 1; + else if(a1.w.b[5] < a2.w.b[5]) return 0; + + if(a1.w.b[6] > a2.w.b[6]) return 1; + else if(a1.w.b[6] < a2.w.b[6]) return 0; + + if(a1.w.b[7] > a2.w.b[7]) return 1; + else if(a1.w.b[7] < a2.w.b[7]) return 0; + + if(a1.seq > a2.seq) return 1; + else if(a1.seq < a2.seq) return 0; + + if(a1.pos > a2.pos) return 1; + return 0; +} + +int main(int ac, char** av){ + if(ac<4) { + terror("USE: sortWords \n"); + } + + return psort(atoi(av[1]),atoi(av[2]),av[3],av[4]); +} + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/structs.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/structs.h Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,130 @@ +#ifndef STRUCTS_H +#define STRUCTS_H + +#include +//Structs required for the dotplot workflow +#define MAXLID 200 +#define MAXLS 1000000000 +#define READBUF 50000000 //50MB + +//Struct for words program +typedef struct { + //Each letter is stored using 2 bits + //We have 4 letters per byte and a + //maximum of 32 in 'b' + unsigned char b[8]; +} word; + +//Struct for words and sort program +typedef struct { + //Word compressed in binary format + word w; + //Ocurrence position in the sequence + uint64_t pos; + //For multiple sequence files this var + //reflects in what sequence occurs the + //word + uint64_t seq; +} wentry; + +//Struct for w2hd program +typedef struct { + //Word compressed in binary format + word w; + //Ocurrence position in the sequence + uint64_t pos; + //Number of ocurrences inside the + //sequence. This is used to know the + //number of locations stored in the + //positions file + uint64_t num; +} hashentry; + +//Struct for w2hd program +typedef struct { + //Ocurrence position in the sequence + uint64_t pos; + //For multiple sequence files this var + //reflects in what sequence occurs the + //word + uint64_t seq; +} location; + +//Struct for hits, sortHits and filterHits programs +typedef struct { + //Diagonal where the hit is located + //This value is calculated as: + //posX - posY + int64_t diag; + //Ocurrence position in sequence X + uint64_t posX; + //Ocurrence position in sequence Y + uint64_t posY; + //For multiple sequence files this var + //reflects in what sequence of X file + //occurs the word + uint64_t seqX; + //For multiple sequence files this var + //reflects in what sequence of Y file + //occurs the word + uint64_t seqY; +} hit; + +//Struct for FragHits, af2png and leeFrag programs +struct FragFile { + //Diagonal where the frag is located + //This value is calculated as: + //posX - posY + int64_t diag; + //Start position in sequence X + uint64_t xStart; + //Start position in Sequence Y + uint64_t yStart; + //End position in Sequence X + uint64_t xEnd; + //End position in Sequence Y + uint64_t yEnd; + //Fragment Length + //For ungaped aligment is: + //xEnd-xStart+1 + uint64_t length; + //Number of identities in the + //fragment + uint64_t ident; + //Score of the fragment. This + //depends on the score matrix + //used + uint64_t score; + //Percentage of similarity. This + //is calculated as score/scoreMax + //Where score max is the maximum + //score possible + float similarity; + //sequence number in the 'X' file + uint64_t seqX; + //sequence number in the 'Y' file + uint64_t seqY; + //synteny block id + int64_t block; + //'f' for the forward strain and 'r' for the reverse + char strand; +}; + +//Struct for leeSeqDB function +struct Sequence{ + char ident[MAXLID+1]; + char *datos; +}; + +//Struct for reads index tuple +struct rIndex2 { + char id[MAXLID]; + uint64_t rNumber; + uint64_t rLen; + uint64_t rLmasked; //Masked positions + uint64_t nonACGT; //N's + uint64_t pos; //Start position of sequence + uint64_t Lac; //Accumulated length +}; + +#endif diff -r 000000000000 -r 9db88f0f32b7 gecko/src/w2hd.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/w2hd.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,81 @@ +/* creates a hash table in disk from a set of ordered words + Syntax: w2hd wordsSort.In prefixNameOUT + + wordsSort is a bin file with Word-Pos-Seq + prefixNameOUT.h2dW : index of words-Pos-Ocurrences + prefixNameOUT.h2dP : positions(Pos+seq) + + Feb.2011: add a new parameter: PrefixSize + + PrefixSize: defines the word-prefix size to be used to identify when two + words are the "same" + + ortrelles@uma.es / Dic.2011 + ---------------------------------------------------------*/ + +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "dictionaryFunctions.h" + +int main(int ac, char** av){ + + char fname[1024]; + uint64_t nWords=0; + FILE* fw, *fOut1, *fOut2; + + wentry we; + hashentry he; + location loc; + + if(ac!=3)terror("USE: w2hd wordsSort.In prefixNameOUT\n"); + + if ((fw = fopen(av[1],"rb"))==NULL) terror("opening IN file"); + + sprintf(fname,"%s.d2hW",av[2]); + if ((fOut1 = fopen(fname,"wb"))==NULL) terror("opening prefix.d2hW file"); + sprintf(fname,"%s.d2hP",av[2]); + if ((fOut2 = fopen(fname,"wb"))==NULL) terror("opening prefix.d2hP file"); + + if(fread(&we,sizeof(wentry),1,fw)!=1){ + terror("empty words file"); + } + memcpy(&he.w.b[0],&we.w.b[0],8); + he.pos=0; + he.num=0; + + while(!feof(fw)){ + loc.pos=we.pos; + loc.seq=we.seq; + if (wordcmp(&he.w.b[0],&we.w.b[0],32)!=0) { + fwrite(&he,sizeof(hashentry),1,fOut1); + memcpy(&he.w.b[0],&we.w.b[0],8); + he.pos=ftell(fOut2); + he.num=0; + } + + fwrite(&loc,sizeof(location),1,fOut2); + he.num++; + nWords++; + + if(fread(&we,sizeof(wentry),1,fw)!=sizeof(wentry)){ + if(ferror(fw))terror("error reading words file"); + } + } + + fwrite(&he,sizeof(hashentry),1,fOut1); + + fprintf(stdout,"\nw2hd: %s tot words=%" PRIu64 "\n",av[1],nWords); + + fclose(fOut1); + fclose(fOut2); + fclose(fw); + + return 0; +} + + + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/words.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/words.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,123 @@ +/* words.c + It is recommended to mask the Low-Complexity regions before using this program + + This program generates a set of 32-mers for the given input sequence. + + Usage: "./words seq.IN words.OUT + where seq.IN is a plain-text sequence + words.OUT is a binary file of "wentry" structures with the 32-mers + ----------------------------------------------------- + oscart@uma.es + */ + +#include +#include +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "dictionaryFunctions.h" + +#define WORD_SIZE 32 +#define BYTES_IN_WORD 8 + +void shift_word(word * w){ + int i; + for(i=0;ib[i]<<=2; + w->b[i]|=(w->b[i+1]>>6); + } + w->b[BYTES_IN_WORD-1]<<=2; +} + +void main_FILE(char * inFile, char * outFile){ + FILE *f; + FILE *f2; + char c; + char *seq = NULL; + uint64_t i = 0, r = 0; + + if ((f=fopen(inFile,"rt"))==NULL){ + perror("opening sequence file"); + } + if ((f2=fopen(outFile,"wb"))==NULL) { + terror("opening OUT sequence Words file"); + } + if ((seq = calloc(READBUF, sizeof(char))) == NULL) { + terror("not enough memory for read buffer"); + } + + //To force the read + i = READBUF + 1; + + c = buffered_fgetc(seq, &i, &r, f); + while (c != '\n') { + c = buffered_fgetc(seq, &i, &r, f); + } + + wentry WE; + WE.seq=0; + unsigned long index=0; + unsigned long inEntry=0; + unsigned long NW=0; + unsigned long Tot=0; + unsigned long NoACGT=0; + unsigned long NoC=0; + + c = buffered_fgetc(seq, &i, &r, f); + while (!feof(f) || (feof(f) && i < r)){ + if (!isupper(toupper(c))){ + if(c=='>'){ + c = buffered_fgetc(seq, &i, &r, f); + while (c != '\n') + c = buffered_fgetc(seq, &i, &r, f); + WE.seq++; + inEntry=0; + index++; + } + NoC++; + c = buffered_fgetc(seq, &i, &r, f); + continue; + } + shift_word(&WE.w); + switch (c) { + case 'A': inEntry++; break; + case 'C': + WE.w.b[BYTES_IN_WORD-1]|=1; + inEntry++; + break; + case 'G': + WE.w.b[BYTES_IN_WORD-1]|=2; + inEntry++; + break; + case 'T': + WE.w.b[BYTES_IN_WORD-1]|=3; + inEntry++; + break; + default : + inEntry=0; NoACGT++; break; + } + index++; + Tot++; + if(inEntry>=(unsigned long)WORD_SIZE){ + WE.pos=index-WORD_SIZE; + NW++; + fwrite(&WE,sizeof(wentry),1,f2); + } + c = buffered_fgetc(seq, &i, &r, f); + } + + fclose(f); + +} + +int main(int ac, char** av){ + if(ac!=3){ + terror("USE: words seqFile.IN words.OUT"); + } + main_FILE(av[1], av[2]); + return 0; +} + diff -r 000000000000 -r 9db88f0f32b7 gecko/src/wordsStat.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/wordsStat.c Sat Dec 15 17:58:58 2018 -0500 @@ -0,0 +1,47 @@ +/* wordsStat.c + + This program shows the position of each word and the + sequence it corresponds to + -----------------------------------------------------12.Nov.2012 + oscart @ uma.es + */ + +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "dictionaryFunctions.h" + +int main(int ac, char** av) { + FILE *f; + uint64_t i = 0; + wentry wordEntry; + char wordString[33]; + wordString[32] = '\0'; + + if (ac != 2) + terror("USE: wordsStat words"); + + if ((f = fopen(av[1], "rt")) == NULL) + terror("opening words file"); + + if(fread(&wordEntry, sizeof(wentry), 1, f)!=1){ + terror("Empty words file"); + } + while (!feof(f)) { + showWord(&wordEntry.w, wordString); + fprintf(stdout, "Words(%" PRIu64 "):%s Pos: %" PRIu64 " Seq: %" PRIu64 "\n", i, wordString, + wordEntry.pos, wordEntry.seq); + if(fread(&wordEntry, sizeof(wentry), 1, f)!=1){ + if(ferror(f))terror("Error reading words file"); + } + i++; + } + + fclose(f); + + return 0; +} +