# HG changeset patch # User bitlab # Date 1605686905 0 # Node ID aec70bb1ae272625ce141d2c0901220fa44b8c04 # Parent dbdb10e4c8b54e2046123c248ebea23b4a8e3505 Uploaded diff -r dbdb10e4c8b5 -r aec70bb1ae27 gecko/bin/frags2align.sh --- a/gecko/bin/frags2align.sh Tue Nov 17 13:04:23 2020 +0000 +++ b/gecko/bin/frags2align.sh Wed Nov 18 08:08:25 2020 +0000 @@ -3,50 +3,46 @@ if [ $# != 4 ]; then echo " ==== ERROR ... you called this script inappropriately." echo "" - echo " usage: $0 fragsFILE.frags fastaX fastaY alignments.txt" + echo " usage: $0 fragsFILE.frags/.csv fastaX fastaY alignments.txt" echo "" exit -1 fi - - -MYRAND=$((( RANDOM % 10000000) +1)) -MGDIR=${PWD}/${MYRAND} - -mkdir $MGDIR - FRAGS=$1 FASTAX=$2 FASTAY=$3 ALIGN=$4 -fragsName=$(basename "$1") -fastaXname=$(basename "$2") -fastaYname=$(basename "$3") -alignName=$(basename "$4") -fragsName="${fragsName%.*}" -fastaXname="${fastaXname%.*}" -fastaYname="${fastaYname%.*}" -alignName="${alignName%.*}" -cp $1 $MGDIR/${fragsName}.frags -cp $2 $MGDIR/${fastaXname}.fasta -cp $3 $MGDIR/${fastaYname}.fasta BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # generate indices -$BINDIR/indexmaker $MGDIR/$fastaXname.fasta $MGDIR/$fastaXname.idx -$BINDIR/indexmaker $MGDIR/$fastaYname.fasta $MGDIR/$fastaYname.idx +$BINDIR/indexmaker $FASTAX $FASTAX.idx +$BINDIR/indexmaker $FASTAY $FASTAY.idx -$BINDIR/reverseComplement $MGDIR/$fastaYname.fasta $MGDIR/$fastaYname.rev -$BINDIR/indexmaker $MGDIR/$fastaYname.rev $MGDIR/$fastaYname.rev.idx +$BINDIR/reverseComplement $FASTAY $FASTAY.rev +$BINDIR/indexmaker $FASTAY.rev $FASTAY.rev.idx # extract frags -$BINDIR/frags2text $MGDIR/$fragsName.frags $MGDIR/$fastaXname.fasta $MGDIR/$fastaYname.fasta $MGDIR/$fastaYname.rev $MGDIR/$fastaXname.idx $MGDIR/$fastaYname.idx $MGDIR/$fastaYname.rev.idx $MGDIR/$alignName.txt +if [ ${FRAGS: -4} == ".csv" ] +then + + echo "....using csv" + sed 's/,/ /g' $FRAGS > $FRAGS.fix + $BINDIR/csvFrags2text $FRAGS.fix $FASTAX $FASTAY $FASTAY.rev $FASTAX.idx $FASTAY.idx $FASTAY.rev.idx $ALIGN + rm $FRAGS.fix -mv $MGDIR/$alignName.txt $ALIGN +else + echo "....using frags" + $BINDIR/frags2text $FRAGS $FASTAX $FASTAY $FASTAY.rev $FASTAX.idx $FASTAY.idx $FASTAY.rev.idx $ALIGN +fi + -rm -rf $MGDIR +rm $FASTAX.idx +rm $FASTAY.idx +rm $FASTAY.rev +rm $FASTAY.rev.idx + diff -r dbdb10e4c8b5 -r aec70bb1ae27 gecko/bin/frags2borders.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/bin/frags2borders.sh Wed Nov 18 08:08:25 2020 +0000 @@ -0,0 +1,53 @@ +#!/bin/bash + +if [ "$#" -lt 5 ]; then + echo " ==== ERROR ... you called this script inappropriately." + echo "" + echo " usage: $0 fragsFILE.csv fastaX fastaY alignments.txt borderSize [query]" + echo " include "query" as ending parameter to extract also the query sequence as fasta file " + echo "" + exit -1 +fi + +FRAGS=$1 +FASTAX=$2 +FASTAY=$3 +ALIGN=$4 +BORDER=$5 +ONLYQUERYSEQ=$6 + + + + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# generate indices + +$BINDIR/indexmaker $FASTAX $FASTAX.idx +$BINDIR/indexmaker $FASTAY $FASTAY.idx + +$BINDIR/reverseComplement $FASTAY $FASTAY.rev +$BINDIR/indexmaker $FASTAY.rev $FASTAY.rev.idx + +# extract frags + +sed 's/,/ /g' $FRAGS > $FRAGS.fix + +$BINDIR/csvExtractBorders $FRAGS.fix $FASTAX $FASTAY $FASTAY.rev $FASTAX.idx $FASTAY.idx $FASTAY.rev.idx $ALIGN $BORDER + +if [[ $ONLYQUERYSEQ = "query" ]] +then + + grep '>\|X:' $ALIGN | awk 'BEGIN{n=0;} {if (substr($0,1,2) ~ "X:" ) { print substr($0,8);} else { printf(">S%d-%s\n", n, $0); n = n + 1; }}' > query.fasta + sed 's/>/#/g' query.fasta | sed 's/^#/>/' > query-$BORDER.fasta + rm query.fasta + +fi + + +rm $FRAGS.fix +rm $FASTAX.idx +rm $FASTAY.idx +rm $FASTAY.rev +rm $FASTAY.rev.idx + diff -r dbdb10e4c8b5 -r aec70bb1ae27 gecko/bin/workflow.sh --- a/gecko/bin/workflow.sh Tue Nov 17 13:04:23 2020 +0000 +++ b/gecko/bin/workflow.sh Wed Nov 18 08:08:25 2020 +0000 @@ -2,38 +2,28 @@ FL=1000 # frequency limit -if [ $# != 8 ]; then +if [ $# != 6 ]; then echo " ==== ERROR ... you called this script inappropriately." echo "" - echo " usage: $0 seqXName seqYName lenght similarity WL fixedL output.frags output.csv" + echo " usage: $0 seqXName seqYName lenght similarity WL fixedL" echo "" exit -1 fi - { - -MYRAND=$((( RANDOM % 10000000) +1)) -MGDIR=${PWD}/${MYRAND} -echo "MGDIR $MGDIR" -mkdir -p ${MGDIR} - -genoXname=$(basename "$1") -genoYname=$(basename "$2") -genoXname="${genoXname%.*}" -genoYname="${genoYname%.*}" +dirNameX=$(readlink -f $1 | xargs dirname) +seqXName=$(basename "$1") +extensionX="${seqXName##*.}" +seqXName="${seqXName%.*}" +dirNameY=$(readlink -f $2 | xargs dirname) +seqYName=$(basename "$2") +extensionY="${seqYName##*.}" +seqYName="${seqYName%.*}" -cp $1 $MGDIR/${genoXname}.fasta -cp $2 $MGDIR/${genoYname}.fasta -mkdir -p ${MGDIR}/dictionaries -mkdir -p ${MGDIR}/fragments - -genoXExt="fasta" -genoYExt="fasta" - - +#seqXName=`basename $1 .fasta` +#seqYName=`basename $2 .fasta` BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" @@ -41,42 +31,40 @@ similarity=${4} WL=${5} # wordSize fixedL=${6} -output=${7} -csv=${8} +strand=${7} -mkdir ${MGDIR}/intermediateFiles +mkdir intermediateFiles -mkdir ${MGDIR}/intermediateFiles/${genoXname}-${genoYname} -mkdir ${MGDIR}/results -mkdir ${MGDIR}/intermediateFiles/dictionaries -mkdir ${MGDIR}/intermediateFiles/hits +mkdir intermediateFiles/${seqXName}-${seqYName} +mkdir results +mkdir intermediateFiles/dictionaries +mkdir intermediateFiles/hits # Copiamos los fastas -ln -s ${MGDIR}/${genoXname}.${genoXname} ${MGDIR}/intermediateFiles/${genoXname}-${genoYname} -ln -s ${MGDIR}/${genoYname}.${genoYname} ${MGDIR}/intermediateFiles/${genoYname}-${genoXname} +ln -s ${dirNameX}/${seqXName}.${extensionX} intermediateFiles/${seqXName}-${seqYName} +ln -s ${dirNameY}/${seqYName}.${extensionY} intermediateFiles/${seqXName}-${seqYName} -cd ${MGDIR}/intermediateFiles/${genoXname}-${genoYname} +cd intermediateFiles/${seqXName}-${seqYName} ############### - -echo "${BINDIR}/reverseComplement ${MGDIR}/${genoYname}.${genoXExt} ${genoYname}-revercomp.${genoYExt}" -${BINDIR}/reverseComplement ${MGDIR}/${genoYname}.${genoYExt} ${MGDIR}/${genoYname}-revercomp.${genoYExt} +echo "${BINDIR}/reverseComplement ${seqYName}.${extensionX} ${seqYName}-revercomp.${extensionY}" +${BINDIR}/reverseComplement ${seqYName}.${extensionX} ${seqYName}-revercomp.${extensionY} -if [[ ! -f ../dictionaries/${genoXname}.d2hP ]]; then - echo "${BINDIR}/dictionary.sh ${MGDIR}/${genoXname}.${genoXExt} &" - ${BINDIR}/dictionary.sh ${MGDIR}/${genoXname}.${genoXExt} & -fi - -if [[ ! -f ../dictionaries/${seqYName}.d2hP ]]; then - echo "${BINDIR}/dictionary.sh ${MGDIR}/${genoYname}.${genoYExt} &" - ${BINDIR}/dictionary.sh ${MGDIR}/${genoYname}.${genoYExt} & +if [[ ! -f ../dictionaries/${seqXName}.d2hP ]]; then + echo "${BINDIR}/dictionary.sh ${seqXName}.${extensionX} &" + ${BINDIR}/dictionary.sh ${seqXName}.${extensionX} & fi -if [[ ! -f ../dictionaries/${genoYname}-revercomp.d2hP ]]; then - echo "${BINDIR}/dictionary.sh ${MGDIR}/${genoYname}-revercomp.${genoYExt} &" - ${BINDIR}/dictionary.sh ${MGDIR}/${genoYname}-revercomp.${genoYExt} & +if [[ ! -f ../dictionaries/${seqYName}.d2hP ]]; then + echo "${BINDIR}/dictionary.sh ${seqYName}.${extensionY} &" + ${BINDIR}/dictionary.sh ${seqYName}.${extensionY} & +fi + +if [[ ! -f ../dictionaries/${seqYName}-revercomp.d2hP ]]; then + echo "${BINDIR}/dictionary.sh ${seqYName}-revercomp.${extensionY} &" + ${BINDIR}/dictionary.sh ${seqYName}-revercomp.${extensionY} & fi echo "Waiting for the calculation of the dictionaries" @@ -88,31 +76,28 @@ done - -mv ${genoXname}.d2hP ../dictionaries/ -mv ${genoXname}.d2hW ../dictionaries/ -mv ${genoYname}.d2hP ../dictionaries/ -mv ${genoYname}.d2hW ../dictionaries/ -mv ${genoYname}-revercomp.d2hP ../dictionaries/ -mv ${genoYname}-revercomp.d2hW ../dictionaries/ - - +mv ${seqXName}.d2hP ../dictionaries/ +mv ${seqXName}.d2hW ../dictionaries/ +mv ${seqYName}.d2hP ../dictionaries/ +mv ${seqYName}.d2hW ../dictionaries/ +mv ${seqYName}-revercomp.d2hP ../dictionaries/ +mv ${seqYName}-revercomp.d2hW ../dictionaries/ # Hacemos enlace simbolico -ln -s ../dictionaries/${genoXname}.d2hP . -ln -s ../dictionaries/${genoXname}.d2hW . +ln -s ../dictionaries/${seqXName}.d2hP . +ln -s ../dictionaries/${seqXName}.d2hW . -ln -s ../dictionaries/${genoYname}.d2hP . -ln -s ../dictionaries/${genoYname}.d2hW . +ln -s ../dictionaries/${seqYName}.d2hP . +ln -s ../dictionaries/${seqYName}.d2hW . -ln -s ../dictionaries/${genoYname}-revercomp.d2hP . -ln -s ../dictionaries/${genoYname}-revercomp.d2hW . +ln -s ../dictionaries/${seqYName}-revercomp.d2hP . +ln -s ../dictionaries/${seqYName}-revercomp.d2hW . -echo "${BINDIR}/comparison.sh ${MGDIR}/${genoXname}.${genoXExt} ${MGDIR}/${genoYname}.${genoYExt} ${length} ${similarity} ${WL} ${fixedL} f &" -${BINDIR}/comparison.sh ${MGDIR}/${genoXname}.${genoXExt} ${MGDIR}/${genoYname}.${genoYExt} ${length} ${similarity} ${WL} ${fixedL} f & +echo "${BINDIR}/comparison.sh ${seqXName}.${extensionX} ${seqYName}.${extensionY} ${length} ${similarity} ${WL} ${fixedL} f &" +${BINDIR}/comparison.sh ${seqXName}.${extensionX} ${seqYName}.${extensionY} ${length} ${similarity} ${WL} ${fixedL} f & -echo "${BINDIR}/comparison.sh ${MGDIR}/${genoXname}.${genoXExt} ${MGDIR}/${genoYname}-revercomp.${genoYExt} ${length} ${similarity} ${WL} ${fixedL} r &" -${BINDIR}/comparison.sh ${MGDIR}/${genoXname}.${genoXExt} ${MGDIR}/${genoYname}-revercomp.${genoYExt} ${length} ${similarity} ${WL} ${fixedL} r & +echo "${BINDIR}/comparison.sh ${seqXName}.${extensionX} ${seqYName}-revercomp.${extensionY} ${length} ${similarity} ${WL} ${fixedL} r &" +${BINDIR}/comparison.sh ${seqXName}.${extensionX} ${seqYName}-revercomp.${extensionY} ${length} ${similarity} ${WL} ${fixedL} r & echo "Waiting for the comparisons" @@ -122,47 +107,45 @@ wait $job done - #echo "rm ${seqYName}-revercomp.${extensionY}" #rm ${seqYName}-revercomp.${extensionY} +echo "${BINDIR}/combineFrags ${seqXName}-${seqYName}-sf.frags ${seqXName}-${seqYName}-revercomp-sr.frags ${seqXName}-${seqYName}.frags" +${BINDIR}/combineFrags ${seqXName}-${seqYName}-sf.frags ${seqXName}-${seqYName}-revercomp-sr.frags ${seqXName}-${seqYName}.frags -echo "${BINDIR}/combineFrags ${genoXname}-${genoYname}-sf.frags ${genoXname}-${genoYname}-revercomp-sr.frags ${genoXname}-${genoYname}.frags" -${BINDIR}/combineFrags ${genoXname}-${genoYname}-sf.frags ${genoXname}-${genoYname}-revercomp-sr.frags ${genoXname}-${genoYname}.frags +#echo "${BINDIR}/newFragToBalazsVersion ${seqXName}-${seqYName}.frags ${seqXName}-${seqYName}.old.frags" +#${BINDIR}/newFragToBalazsVersion ${seqXName}-${seqYName}.frags ${seqXName}-${seqYName}.old.frags + +#echo "${BINDIR}/af2pngrev ${seqXName}-${seqYName}.frags ${seqXName}-${seqYName}.png ${seqXName} ${seqYName}" +#${BINDIR}/af2pngrev ${seqXName}-${seqYName}.frags ${seqXName}-${seqYName}.png ${seqXName} ${seqYName} #Borramos todo menos los frags y los diccionarios # Get Info from frags -echo "${BINDIR}/getInfo ${genoXname}-${genoYname}.frags > ${genoXname}-${genoYname}.csv" -${BINDIR}/getInfo ${genoXname}-${genoYname}.frags > ${genoXname}-${genoYname}.csv.tmp -cat ${genoXname}-${genoYname}.frags.INF ${genoXname}-${genoYname}.csv.tmp > ${genoXname}-${genoYname}.csv -rm -rf ${genoXname}-${genoYname}.csv.tmp +echo "${BINDIR}/getInfo ${seqXName}-${seqYName}.frags > ${seqXName}-${seqYName}.csv" +${BINDIR}/getInfo ${seqXName}-${seqYName}.frags > ${seqXName}-${seqYName}.csv.tmp +cat ${seqXName}-${seqYName}.frags.INF ${seqXName}-${seqYName}.csv.tmp > ${seqXName}-${seqYName}.csv +rm -rf ${seqXName}-${seqYName}.csv.tmp -if [[ -L "../../${genoXname}.fasta" ]] +if [[ -L "../../${seqXName}.fasta" ]] then - rm ../../${genoYname}.fasta + rm ../../${seqXName}.fasta fi -if [[ -L "../../${genoXname}.fasta" ]] +if [[ -L "../../${seqYName}.fasta" ]] then - rm ../../${genoYname}.fasta + rm ../../${seqYName}.fasta fi #Movemos los frags y los info -mv ${genoXname}-${genoYname}.frags $output -mv ${genoXname}-${genoYname}.csv $csv - - +mv ${seqXName}-${seqYName}.frags ../../results +mv ${seqXName}-${seqYName}.frags.INF ../../results +mv ${seqXName}-${seqYName}.frags.MAT ../../results +#mv ${seqXName}-${seqYName}.old.frags ../../results +mv ${seqXName}-${seqYName}.csv ../../results #echo "Borrando ${seqXName}-${seqYName}" cd .. - #rm -rf ${seqXName}-${seqYName} -cd .. - - -rm -r ${MGDIR} - } &> /dev/null - diff -r dbdb10e4c8b5 -r aec70bb1ae27 gecko/src/Makefile --- a/gecko/src/Makefile Tue Nov 17 13:04:23 2020 +0000 +++ b/gecko/src/Makefile Wed Nov 18 08:08:25 2020 +0000 @@ -3,7 +3,7 @@ CFLAGS=-O3 -D_FILE_OFFSET_BITS=64 -Wall BIN=../bin -all: reverseComplement words sortWords w2hd wordsStat hits sortHits filterHits FragHits combineFrags hitsStat hdStat fragStat getInfoCSB filterFrags indexmaker frags2text +all: reverseComplement words sortWords w2hd wordsStat hits sortHits filterHits FragHits combineFrags hitsStat hdStat fragStat getInfoCSB filterFrags indexmaker frags2text csvFrags2text csvExtractBorders hits: hits.c $(CC) $(CFLAGS) hits.c comparisonFunctions.c dictionaryFunctions.c commonFunctions.c -o $(BIN)/hits @@ -56,6 +56,12 @@ frags2text: $(CC) $(CFLAGS) frags2text.c fragmentv2.c commonFunctions.c comparisonFunctions.c -lm -o $(BIN)/frags2text +csvFrags2text: + $(CC) $(CFLAGS) csvFrags2text.c fragmentv2.c commonFunctions.c comparisonFunctions.c -lm -o $(BIN)/csvFrags2text + +csvExtractBorders: + $(CC) $(CFLAGS) csvExtractBorders.c fragmentv2.c commonFunctions.c comparisonFunctions.c -lm -o $(BIN)/csvExtractBorders + clean: rm -rf $(BIN)/getInfo rm -rf $(BIN)/filterFrags @@ -63,3 +69,4 @@ rm -rf $(BIN)/frags2text rm -rf $(BIN)/reverseComplement $(BIN)/words $(BIN)/sortWords $(BIN)/w2hd $(BIN)/wordsStat rm -rf $(BIN)/hits $(BIN)/sortHits $(BIN)/filterHits $(BIN)/FragHits $(BIN)/combineFrags $(BIN)/hitsStat $(BIN)/fragStat $(BIN)/hdStat + rm -rf $(BIN)/csvExtractBorders $(BIN)/csvFrags2text.c diff -r dbdb10e4c8b5 -r aec70bb1ae27 gecko/src/csvExtractBorders.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/csvExtractBorders.c Wed Nov 18 08:08:25 2020 +0000 @@ -0,0 +1,396 @@ +/* + * + * Sintax: ./frags2text fragsFILE.frags fastaX fastaY fragsFILE.txt + + */ + +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "comparisonFunctions.h" + +#define TAB_INSERT 70 +#define READING_FRAG_BUFFER 10000 + +#define MAX(a,b) \ + ({ __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + _a > _b ? _a : _b; }) + +#define MIN(a,b) \ + ({ __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + _a < _b ? _a : _b; }) + +void csv_frag_to_struct_frag(char * l, struct FragFile * f){ + + //0 1 2 3 4 5 6 7 8 9 10 11 12 13 + //Type xStart yStart xEnd yEnd strand(f/r) block length score ident similarity %ident SeqX SeqY + //Frag 3147493 3006054 3154663 2998884 f 0 7171 20868 6194 72.75 0.86 0 0 + + + float bin; + + sscanf(l, "%*s %"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64" %c %"PRId64" %"PRIu64" %"PRIu64" %"PRIu64" %f %f %"PRIu64" %"PRIu64, &f->xStart, &f->yStart, &f->xEnd, &f->yEnd, &f->strand, &f->block, &f->length, &f->score, &f->ident, &f->similarity, &bin, &f->seqX, &f->seqY); + + //printf("Read %d items\n", items); + +} + +struct rIndex2 * loadReadsIndex(char * filename, uint64_t * nReads){ + struct rIndex2 * RR; + uint64_t nR=0,i; + FILE *f; + uint64_t fsize; + + if ((f=fopen(filename,"rb"))==NULL) terror("Could not open index input file"); + + fseeko(f, 0, SEEK_END); + fsize = ftello(f); + rewind(f); + nR = fsize/sizeof(struct rIndex2); + + if ((RR =(struct rIndex2*) calloc(nR,sizeof(struct rIndex2)))==NULL) terror("Could not allocate index"); + + for (i=0; ilength, f->ident, f->strand, f->xStart, f->yStart); +} + +void get_both_seqs(char * fastaX, char * fastaY, uint64_t iniX, uint64_t finX, uint64_t iniY, uint64_t finY, uint64_t posX, uint64_t posY, uint64_t LacX, uint64_t LacY, uint64_t lX, uint64_t lY, FILE * fO){ + char copyX[TAB_INSERT+1]; + char copyY[TAB_INSERT+1]; + memset(copyX, 0x0, TAB_INSERT+1); + memset(copyY, 0x0, TAB_INSERT+1); + uint64_t atcopyX = 0; + uint64_t atcopyY = 0; + uint64_t i; + + uint64_t pos_used_x, pos_used_y; + + // The X one + //fseek(fastaX, posX, SEEK_SET); + pos_used_x = posX; + char cX = fastaX[pos_used_x++]; + while(cX != '\n'){ cX = fastaX[pos_used_x++]; } + uint64_t gpX = iniX - LacX; + uint64_t currX = 0, tab_counter; + while(currX < gpX){ + cX = fastaX[pos_used_x++]; + if(cX != '\n') ++currX; + } + + // the other Y + //fseek(fastaY, posY, SEEK_SET); + pos_used_y = posY; + char cY = fastaY[pos_used_y++]; + while(cY != '\n'){ cY = fastaY[pos_used_y++]; } + uint64_t gpY = iniY - LacY; + uint64_t currY = 0; + while(currY < gpY){ + cY = fastaY[pos_used_y++]; + if(cY != '\n') ++currY; + } + + + + + // Reached the region to show + currX = 0; + currY = 0; + cX = fastaX[pos_used_x++]; + cY = fastaY[pos_used_y++]; + //fprintf(fO, "\t"); + tab_counter = 0; + while(currX < lX && currY < lY){ + + if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; } + cX = fastaX[pos_used_x++]; + while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++]; + + + if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; } + cY = fastaY[pos_used_y++]; + while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++]; + + while(currX > currY){ + if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; } + cY = fastaY[pos_used_y++]; + while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++]; + } + while(currX < currY){ + if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; } + cX = fastaX[pos_used_x++]; + while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++]; + } + + ++tab_counter; + + + + if(tab_counter >= TAB_INSERT && currX == currY){ + + copyX[TAB_INSERT] = '\0'; + copyY[TAB_INSERT] = '\0'; + fprintf(fO, "X:\t%.*s\n\t", TAB_INSERT, copyX); + for(i=0; i 0){ + copyX[atcopyX] = '\0'; + copyY[atcopyY] = '\0'; + fprintf(fO, "X:\t%.*s\n\t", (int)atcopyX, copyX); + for(i=0; i + //RI[id].Lac is the sum of the reads length prior + + if(frag.strand == 'f'){ + write_headers(fX, fY, fO, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, &frag); + }else{ + write_headers(fX, fYrev, fO, RI_X[frag.seqX].pos, RI_Yrev[(nReads_Y - frag.seqY) - 1].pos, &frag); + } + + + //get_seq_from_to(fX, fO, frag.xStart, frag.xEnd, RI_X[frag.seqX].pos, RI_X[frag.seqX].Lac, frag.seqX, frag.length, fO); + + + + if(frag.strand == 'f'){ + + int64_t leftx = MAX(0, (int64_t)frag.xStart - (int64_t)border_size); + int64_t rightx = MIN(xlen, (int64_t)frag.xEnd + (int64_t)border_size); + int64_t lefty = MAX(0, (int64_t)frag.yStart - (int64_t)border_size); + int64_t righty = MIN(ylen, (int64_t)frag.yEnd + (int64_t) border_size); + + int64_t border_left_size = MIN(frag.xStart - leftx, frag.yStart - lefty); + int64_t border_right_size = MIN(rightx - frag.xEnd, righty - frag.yEnd); + + get_both_seqs(strfastaX, strfastaY, frag.xStart - border_left_size, frag.xEnd + border_right_size, frag.yStart - border_left_size, frag.yEnd + border_right_size, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, RI_X[frag.seqX].Lac, RI_Y[frag.seqY].Lac, frag.length + border_left_size + border_right_size, frag.length + border_left_size + border_right_size, fO); + //get_seq_from_to(fY, fO, frag.yStart, frag.yEnd, RI_Y[frag.seqY].pos, RI_Y[frag.seqY].Lac, frag.seqY, frag.length, fO); + }else{ + uint64_t seqYnew; + seqYnew = (nReads_Y - frag.seqY) - 1; + + frag.yStart = ylen - frag.yStart - 1; + frag.yEnd = ylen - frag.yEnd - 1; + + int64_t leftx = MAX(0, (int64_t)frag.xStart - (int64_t)border_size); + int64_t rightx = MIN(xlen, (int64_t)frag.xEnd + (int64_t)border_size); + int64_t lefty = MAX(0, (int64_t)frag.yStart - (int64_t)border_size); + int64_t righty = MIN(ylen, (int64_t)frag.yEnd + (int64_t) border_size); + + int64_t border_left_size = MIN(frag.xStart - leftx, frag.yStart - lefty); + int64_t border_right_size = MIN(rightx - frag.xEnd, righty - frag.yEnd); + + + + get_both_seqs(strfastaX, strfastaYrev, frag.xStart - border_left_size, frag.xEnd + border_right_size, frag.yStart - border_left_size, frag.yEnd + border_right_size, RI_X[frag.seqX].pos, RI_Yrev[seqYnew].pos, RI_X[frag.seqX].Lac, RI_Yrev[seqYnew].Lac, frag.length + border_left_size + border_right_size, frag.length + border_left_size + border_right_size, fO); + //get_seq_from_to_rev(fYrev, fO, frag.yStart, frag.yEnd, RI_Yrev[seqYnew].pos, RI_Yrev[seqYnew].Lac, seqYnew, frag.length, fO); + } + + //readFragment(&frag, fFrags); + if(!feof(fFrags)){ + fgets(buffer, READING_FRAG_BUFFER, fFrags); + csv_frag_to_struct_frag(buffer, &frag); + + + }else{ + exit = 1; + } + //printf("line read: %s\n", buffer); + //printf("HI im a frag: %"PRIu64", %"PRIu64" - %"PRIu64", %"PRIu64"\n", frag.xStart, frag.xEnd, frag.yStart, frag.yEnd); + //getchar(); + } + + fclose(fFrags); + fclose(fX); + fclose(fY); + fclose(fO); + + free(RI_X); + free(RI_Y); + free(RI_Yrev); + + free(strfastaX); + free(strfastaY); + free(strfastaYrev); + + return 0; +} diff -r dbdb10e4c8b5 -r aec70bb1ae27 gecko/src/csvFrags2text.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gecko/src/csvFrags2text.c Wed Nov 18 08:08:25 2020 +0000 @@ -0,0 +1,347 @@ +/* + * + * Sintax: ./frags2text fragsFILE.frags fastaX fastaY fragsFILE.txt + + */ + +#include +#include +#include +#include +#include "structs.h" +#include "commonFunctions.h" +#include "comparisonFunctions.h" + +#define TAB_INSERT 70 +#define READING_FRAG_BUFFER 10000 + +void csv_frag_to_struct_frag(char * l, struct FragFile * f){ + + //0 1 2 3 4 5 6 7 8 9 10 11 12 13 + //Type xStart yStart xEnd yEnd strand(f/r) block length score ident similarity %ident SeqX SeqY + //Frag 3147493 3006054 3154663 2998884 f 0 7171 20868 6194 72.75 0.86 0 0 + + + float bin; + + sscanf(l, "%*s %"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64" %c %"PRId64" %"PRIu64" %"PRIu64" %"PRIu64" %f %f %"PRIu64" %"PRIu64, &f->xStart, &f->yStart, &f->xEnd, &f->yEnd, &f->strand, &f->block, &f->length, &f->score, &f->ident, &f->similarity, &bin, &f->seqX, &f->seqY); + + //printf("Read %d items\n", items); + +} + +struct rIndex2 * loadReadsIndex(char * filename, uint64_t * nReads){ + struct rIndex2 * RR; + uint64_t nR=0,i; + FILE *f; + uint64_t fsize; + + if ((f=fopen(filename,"rb"))==NULL) terror("Could not open index input file"); + + fseeko(f, 0, SEEK_END); + fsize = ftello(f); + rewind(f); + nR = fsize/sizeof(struct rIndex2); + + if ((RR =(struct rIndex2*) calloc(nR,sizeof(struct rIndex2)))==NULL) terror("Could not allocate index"); + + for (i=0; ilength, f->ident, f->strand, f->xStart, f->yStart); +} + +void get_both_seqs(char * fastaX, char * fastaY, uint64_t iniX, uint64_t finX, uint64_t iniY, uint64_t finY, uint64_t posX, uint64_t posY, uint64_t LacX, uint64_t LacY, uint64_t lX, uint64_t lY, FILE * fO){ + char copyX[TAB_INSERT+1]; + char copyY[TAB_INSERT+1]; + memset(copyX, 0x0, TAB_INSERT+1); + memset(copyY, 0x0, TAB_INSERT+1); + uint64_t atcopyX = 0; + uint64_t atcopyY = 0; + uint64_t i; + + uint64_t pos_used_x, pos_used_y; + + // The X one + //fseek(fastaX, posX, SEEK_SET); + pos_used_x = posX; + char cX = fastaX[pos_used_x++]; + while(cX != '\n'){ cX = fastaX[pos_used_x++]; } + uint64_t gpX = iniX - LacX; + uint64_t currX = 0, tab_counter; + while(currX < gpX){ + cX = fastaX[pos_used_x++]; + if(cX != '\n') ++currX; + } + + // the other Y + //fseek(fastaY, posY, SEEK_SET); + pos_used_y = posY; + char cY = fastaY[pos_used_y++]; + while(cY != '\n'){ cY = fastaY[pos_used_y++]; } + uint64_t gpY = iniY - LacY; + uint64_t currY = 0; + while(currY < gpY){ + cY = fastaY[pos_used_y++]; + if(cY != '\n') ++currY; + } + + + + + // Reached the region to show + currX = 0; + currY = 0; + cX = fastaX[pos_used_x++]; + cY = fastaY[pos_used_y++]; + //fprintf(fO, "\t"); + tab_counter = 0; + while(currX < lX && currY < lY){ + + if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; } + cX = fastaX[pos_used_x++]; + while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++]; + + + if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; } + cY = fastaY[pos_used_y++]; + while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++]; + + while(currX > currY){ + if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; } + cY = fastaY[pos_used_y++]; + while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++]; + } + while(currX < currY){ + if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; } + cX = fastaX[pos_used_x++]; + while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++]; + } + + ++tab_counter; + + + + if(tab_counter >= TAB_INSERT && currX == currY){ + + copyX[TAB_INSERT] = '\0'; + copyY[TAB_INSERT] = '\0'; + fprintf(fO, "X:\t%.*s\n\t", TAB_INSERT, copyX); + for(i=0; i 0){ + copyX[atcopyX] = '\0'; + copyY[atcopyY] = '\0'; + fprintf(fO, "X:\t%.*s\n\t", (int)atcopyX, copyX); + for(i=0; i + //RI[id].Lac is the sum of the reads length prior + + if(frag.strand == 'f'){ + write_headers(fX, fY, fO, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, &frag); + }else{ + write_headers(fX, fYrev, fO, RI_X[frag.seqX].pos, RI_Yrev[(nReads_Y - frag.seqY) - 1].pos, &frag); + } + + + //get_seq_from_to(fX, fO, frag.xStart, frag.xEnd, RI_X[frag.seqX].pos, RI_X[frag.seqX].Lac, frag.seqX, frag.length, fO); + + + + if(frag.strand == 'f'){ + get_both_seqs(strfastaX, strfastaY, frag.xStart, frag.xEnd, frag.yStart, frag.yEnd, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, RI_X[frag.seqX].Lac, RI_Y[frag.seqY].Lac, frag.length, frag.length, fO); + //get_seq_from_to(fY, fO, frag.yStart, frag.yEnd, RI_Y[frag.seqY].pos, RI_Y[frag.seqY].Lac, frag.seqY, frag.length, fO); + }else{ + uint64_t seqYnew; + seqYnew = (nReads_Y - frag.seqY) - 1; + get_both_seqs(strfastaX, strfastaYrev, frag.xStart, frag.xEnd, frag.yStart, frag.yEnd, RI_X[frag.seqX].pos, RI_Yrev[seqYnew].pos, RI_X[frag.seqX].Lac, RI_Yrev[seqYnew].Lac, frag.length, frag.length, fO); + //get_seq_from_to_rev(fYrev, fO, frag.yStart, frag.yEnd, RI_Yrev[seqYnew].pos, RI_Yrev[seqYnew].Lac, seqYnew, frag.length, fO); + } + + //readFragment(&frag, fFrags); + if(!feof(fFrags)){ + fgets(buffer, READING_FRAG_BUFFER, fFrags); + csv_frag_to_struct_frag(buffer, &frag); + + + }else{ + exit = 1; + } + //printf("line read: %s\n", buffer); + //printf("HI im a frag: %"PRIu64", %"PRIu64" - %"PRIu64", %"PRIu64"\n", frag.xStart, frag.xEnd, frag.yStart, frag.yEnd); + //getchar(); + } + + fclose(fFrags); + fclose(fX); + fclose(fY); + fclose(fO); + + free(RI_X); + free(RI_Y); + free(RI_Yrev); + + free(strfastaX); + free(strfastaY); + free(strfastaYrev); + + return 0; +} diff -r dbdb10e4c8b5 -r aec70bb1ae27 gecko/src/reverseComplement.c --- a/gecko/src/reverseComplement.c Tue Nov 17 13:04:23 2020 +0000 +++ b/gecko/src/reverseComplement.c Wed Nov 18 08:08:25 2020 +0000 @@ -15,7 +15,7 @@ #include #include "commonFunctions.h" -#define SEQSIZE 500000000 +#define SEQSIZE 2000000000 #define WSIZE 32 #define NREADS 1000000