Next changeset 1:7fff03d777dd (2018-12-17) |
Commit message:
Uploaded |
added:
IMSAME/IMSAME.xml IMSAME/LICENSE IMSAME/bin/IMSAME IMSAME/bin/all_vs_all_metagenomes_IMSAME.sh IMSAME/bin/combine IMSAME/bin/revComp IMSAME/src/IMSAME.c IMSAME/src/Makefile IMSAME/src/alignmentFunctions.c IMSAME/src/alignmentFunctions.h IMSAME/src/combine_reads.c IMSAME/src/commonFunctions.c IMSAME/src/commonFunctions.h IMSAME/src/covident.c IMSAME/src/reverseComplement.c IMSAME/src/structs.h |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/IMSAME.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/IMSAME.xml Sat Dec 15 18:04:10 2018 -0500 |
b |
@@ -0,0 +1,15 @@ +<tool id="IMSAME" name="IMSAME"> + <description>A Pairwise Incremental Multi-Staged Alignment Method for Metagenomes Comparison</description> + <inputs> + <param name="query" type="data" format="fasta" label="Query metagenome" help="Query sequence file in fasta format" /> + <param name="db" type="data" format="fasta" label="Reference metagenome" help="Reference sequence file in fasta format" /> + <param name="evalue" type="text" size="30" value="10e-10" label="Evalue" help="Evalue for filtering HSPs" /> + <param name="coverage" type="float" value="0.5" label="Coverage" help="Minimum coverage to accept an alignment" /> + <param name="identity" type="float" value="0.5" label="Identity" help="Minimum identity to accept an alignment" /> + <param name="kmer" type="integer" value="12" label="Word size" help="Word size for heuristicly exploring the search space" /> + </inputs> + <command>/home/galaxy-bitlab/galaxy/tools/IMSAME/bin/IMSAME -query $query -db $db -evalue $evalue -coverage $coverage -identity $identity -kmer $kmer -out $output --full</command> + <outputs> + <data name="output" type="txt" label="Output alignments"/> + </outputs> +</tool> |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/LICENSE Sat Dec 15 18:04:10 2018 -0500 |
b |
b'@@ -0,0 +1,674 @@\n+ GNU GENERAL PUBLIC LICENSE\n+ Version 3, 29 June 2007\n+\n+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>\n+ Everyone is permitted to copy and distribute verbatim copies\n+ of this license document, but changing it is not allowed.\n+\n+ Preamble\n+\n+ The GNU General Public License is a free, copyleft license for\n+software and other kinds of works.\n+\n+ The licenses for most software and other practical works are designed\n+to take away your freedom to share and change the works. By contrast,\n+the GNU General Public License is intended to guarantee your freedom to\n+share and change all versions of a program--to make sure it remains free\n+software for all its users. We, the Free Software Foundation, use the\n+GNU General Public License for most of our software; it applies also to\n+any other work released this way by its authors. You can apply it to\n+your programs, too.\n+\n+ When we speak of free software, we are referring to freedom, not\n+price. Our General Public Licenses are designed to make sure that you\n+have the freedom to distribute copies of free software (and charge for\n+them if you wish), that you receive source code or can get it if you\n+want it, that you can change the software or use pieces of it in new\n+free programs, and that you know you can do these things.\n+\n+ To protect your rights, we need to prevent others from denying you\n+these rights or asking you to surrender the rights. Therefore, you have\n+certain responsibilities if you distribute copies of the software, or if\n+you modify it: responsibilities to respect the freedom of others.\n+\n+ For example, if you distribute copies of such a program, whether\n+gratis or for a fee, you must pass on to the recipients the same\n+freedoms that you received. You must make sure that they, too, receive\n+or can get the source code. And you must show them these terms so they\n+know their rights.\n+\n+ Developers that use the GNU GPL protect your rights with two steps:\n+(1) assert copyright on the software, and (2) offer you this License\n+giving you legal permission to copy, distribute and/or modify it.\n+\n+ For the developers\' and authors\' protection, the GPL clearly explains\n+that there is no warranty for this free software. For both users\' and\n+authors\' sake, the GPL requires that modified versions be marked as\n+changed, so that their problems will not be attributed erroneously to\n+authors of previous versions.\n+\n+ Some devices are designed to deny users access to install or run\n+modified versions of the software inside them, although the manufacturer\n+can do so. This is fundamentally incompatible with the aim of\n+protecting users\' freedom to change the software. The systematic\n+pattern of such abuse occurs in the area of products for individuals to\n+use, which is precisely where it is most unacceptable. Therefore, we\n+have designed this version of the GPL to prohibit the practice for those\n+products. If such problems arise substantially in other domains, we\n+stand ready to extend this provision to those domains in future versions\n+of the GPL, as needed to protect the freedom of users.\n+\n+ Finally, every program is threatened constantly by software patents.\n+States should not allow patents to restrict development and use of\n+software on general-purpose computers, but in those that do, we wish to\n+avoid the special danger that patents applied to a free program could\n+make it effectively proprietary. To prevent this, the GPL assures that\n+patents cannot be used to render the program non-free.\n+\n+ The precise terms and conditions for copying, distribution and\n+modification follow.\n+\n+ TERMS AND CONDITIONS\n+\n+ 0. Definitions.\n+\n+ "This License" refers to version 3 of the GNU General Public License.\n+\n+ "Copyright" also means copyright-like laws that apply to other kinds of\n+works, such as semiconductor masks.\n+\n+ "The Program" refers to a'..b'CE OF THE PROGRAM\n+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF\n+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n+\n+ 16. Limitation of Liability.\n+\n+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING\n+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS\n+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY\n+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE\n+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF\n+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD\n+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),\n+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF\n+SUCH DAMAGES.\n+\n+ 17. Interpretation of Sections 15 and 16.\n+\n+ If the disclaimer of warranty and limitation of liability provided\n+above cannot be given local legal effect according to their terms,\n+reviewing courts shall apply local law that most closely approximates\n+an absolute waiver of all civil liability in connection with the\n+Program, unless a warranty or assumption of liability accompanies a\n+copy of the Program in return for a fee.\n+\n+ END OF TERMS AND CONDITIONS\n+\n+ How to Apply These Terms to Your New Programs\n+\n+ If you develop a new program, and you want it to be of the greatest\n+possible use to the public, the best way to achieve this is to make it\n+free software which everyone can redistribute and change under these terms.\n+\n+ To do so, attach the following notices to the program. It is safest\n+to attach them to the start of each source file to most effectively\n+state the exclusion of warranty; and each file should have at least\n+the "copyright" line and a pointer to where the full notice is found.\n+\n+ {one line to give the program\'s name and a brief idea of what it does.}\n+ Copyright (C) {year} {name of author}\n+\n+ This program is free software: you can redistribute it and/or modify\n+ it under the terms of the GNU General Public License as published by\n+ the Free Software Foundation, either version 3 of the License, or\n+ (at your option) any later version.\n+\n+ This program is distributed in the hope that it will be useful,\n+ but WITHOUT ANY WARRANTY; without even the implied warranty of\n+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+ GNU General Public License for more details.\n+\n+ You should have received a copy of the GNU General Public License\n+ along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\n+Also add information on how to contact you by electronic and paper mail.\n+\n+ If the program does terminal interaction, make it output a short\n+notice like this when it starts in an interactive mode:\n+\n+ {project} Copyright (C) {year} {fullname}\n+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w\'.\n+ This is free software, and you are welcome to redistribute it\n+ under certain conditions; type `show c\' for details.\n+\n+The hypothetical commands `show w\' and `show c\' should show the appropriate\n+parts of the General Public License. Of course, your program\'s commands\n+might be different; for a GUI interface, you would use an "about box".\n+\n+ You should also get your employer (if you work as a programmer) or school,\n+if any, to sign a "copyright disclaimer" for the program, if necessary.\n+For more information on this, and how to apply and follow the GNU GPL, see\n+<http://www.gnu.org/licenses/>.\n+\n+ The GNU General Public License does not permit incorporating your program\n+into proprietary programs. If your program is a subroutine library, you\n+may consider it more useful to permit linking proprietary applications with\n+the library. If this is what you want to do, use the GNU Lesser General\n+Public License instead of this License. But first, please read\n+<http://www.gnu.org/philosophy/why-not-lgpl.html>.\n' |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/bin/IMSAME |
b |
Binary file IMSAME/bin/IMSAME has changed |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/bin/all_vs_all_metagenomes_IMSAME.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/bin/all_vs_all_metagenomes_IMSAME.sh Sat Dec 15 18:04:10 2018 -0500 |
[ |
@@ -0,0 +1,67 @@ +#!/usr/bin/env bash +DIR=$1 +COV=$2 +SIM=$3 +THR=$4 +EXT=$5 +OUT=$6 + + +BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +array=() +x=0 + +echo "WARNING: FULL COMPARISON IS ENABLED" + +if [ $# != 6 ]; then + echo "***ERROR*** Use: $0 metagenomes_directory coverage similarity threads file_extension outpath" + exit -1 +fi + +for elem in $(ls -d $DIR/*.$EXT | awk -F "/" '{print $NF}' | awk -F ".$EXT" '{print $1}') +do + array[$x]=$elem + x=`expr $x + 1` + #echo "X: $elem" +done + +for ((i=0 ; i < ${#array[@]} ; i++)) +do + for ((j=i ; j < ${#array[@]} ; j++)) + do + if [ $i != $j ]; then + seqX=${array[$i]} + seqY=${array[$j]} + #echo "----------${seqX}-${seqY}-----------" + if [[ ! -f $6/${seqX}-${seqY}.align ]]; then #if the file does not exist + + + ${BINDIR}/IMSAME -query $DIR/${seqX}.$EXT -db $DIR/${seqY}.$EXT -n_threads $THR --full -coverage $COV -identity $SIM -out $6/${seqX}-${seqY}.align + grep "(" $6/${seqX}-${seqY}.align | awk '{print $4, $5, $6}' | sed 's/%//g' > $6/${seqX}-${seqY}.align.filtered + + ${BINDIR}/covident $6/${seqX}-${seqY}.align.filtered $6/${seqX}-${seqY}.align.filtered.mat + + fi + + + + + + if [[ ! -f $6/${seqX}-${seqY}.r.align ]]; then #if the reversed alignment does not exist + # Compute reverse complement + ${BINDIR}/revComp $DIR/${seqY}.$EXT $DIR/${seqY}.r.${EXT} + ${BINDIR}/IMSAME -query $DIR/${seqX}.$EXT -db $DIR/${seqY}.r.$EXT -n_threads $THR --full -coverage $COV -identity $SIM -out $6/${seqX}-${seqY}.r.align + grep "(" $6/${seqX}-${seqY}.r.align | awk '{print $4, $5, $6}' | sed 's/%//g' > $6/${seqX}-${seqY}.r.align.filtered + + ${BINDIR}/covident $6/${seqX}-${seqY}.r.align.filtered $6/${seqX}-${seqY}.align.filtered.mat + + fi + + + + fi + rm $DIR/${seqY}.r.${EXT} + done + +done |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/bin/combine |
b |
Binary file IMSAME/bin/combine has changed |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/bin/revComp |
b |
Binary file IMSAME/bin/revComp has changed |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/src/IMSAME.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/src/IMSAME.c Sat Dec 15 18:04:10 2018 -0500 |
[ |
b'@@ -0,0 +1,760 @@\n+/*********\n+\n+File IMSAME.c\n+Author EPW <estebanpw@uma.es>\n+Description Computes an incremental alignment on reads versus reads using n threads\n+\n+USAGE Usage is described by calling ./IMSAME --help\n+\n+\n+\n+**********/\n+\n+\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <math.h>\n+#include <string.h>\n+#include <ctype.h>\n+#include <pthread.h>\n+#include "structs.h"\n+#include "alignmentFunctions.h"\n+#include "commonFunctions.h"\n+\n+#define MAX(x, y) (((x) > (y)) ? (x) : (y))\n+#define MIN(x, y) (((x) <= (y)) ? (x) : (y))\n+#define STARTING_SEQS 1000\n+#define PIECE_OF_DB_REALLOC 3200000 //half a gigabyte if divided by 8 bytes\n+\n+\n+uint64_t custom_kmer = 12; // Defined as external in structs.h\n+\n+void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * n_threads, long double * minevalue, long double * mincoverage, int * igap, int * egap, long double * minidentity, long double * window, unsigned char * full_comp, uint64_t * custom_kmer, unsigned char * hits_only, uint64_t * n_parts);\n+\n+int VERBOSE_ACTIVE = 0;\n+\n+int main(int argc, char ** av){\n+\n+\n+ \n+\n+\n+ clock_t begin, end;\n+ \n+ int error; //To tell if threads could not be launched\n+ uint64_t i,j;\n+\n+ //query to read kmers from, database to find seeds\n+ FILE * query = NULL, * database = NULL, * out_database = NULL;\n+ long double minevalue = 1/powl(10, 10); //Default 1 * 10^-10\n+ \n+ long double mincoverage = 0.5, minidentity = 0.5, window = 0.15; //Default\n+ int igap = -5, egap = -2;\n+ unsigned char full_comp = FALSE;\n+ unsigned char hits_only = FALSE;\n+\n+ uint64_t n_threads = 4;\n+ uint64_t n_parts = 3; // Default is 3\n+\n+ init_args(argc, av, &query, &database, &out_database, &n_threads, &minevalue, &mincoverage, &igap, &egap, &minidentity, &window, &full_comp, &custom_kmer, &hits_only, &n_parts);\n+ \n+ //uint64_t reads_per_thread;\n+ uint64_t sum_accepted_reads = 0;\n+\n+ begin = clock();\n+\n+ fprintf(stdout, "[INFO] Init. quick table\\n");\n+\n+ pthread_t * threads = (pthread_t *) malloc(n_threads * sizeof(pthread_t));\n+ if(threads == NULL) terror("Could not create threads");\n+\n+ pthread_t * loading_threads = (pthread_t *) malloc(FIXED_LOADING_THREADS * sizeof(pthread_t));\n+ if(loading_threads == NULL) terror("Could not create loading threads");\n+\n+\n+ HashTableArgs * hta = (HashTableArgs *) malloc(n_threads*sizeof(HashTableArgs));\n+ if(hta == NULL) terror("Could not allocate arguments for hash table");\n+ \n+ pthread_mutex_t lock; //The mutex to lock the queue\n+ if (pthread_mutex_init(&lock, NULL) != 0) terror("Could not init mutex");\n+\n+ // To be used if only computing hits\n+ uint64_t ** hits_table = NULL;\n+\n+ unsigned char ** my_x = (unsigned char **) malloc(n_threads * sizeof(unsigned char*));\n+ unsigned char ** my_y = (unsigned char **) malloc(n_threads * sizeof(unsigned char*));\n+\n+ unsigned char ** marker_taggs = NULL; // To be used with full comparison\n+\n+ struct positioned_cell ** mc = (struct positioned_cell **) malloc(n_threads * sizeof(struct positioned_cell *));\n+ struct cell *** table = (struct cell ***) malloc(n_threads * sizeof(struct cell **));\n+ if(table == NULL) terror("Could not allocate NW table");\n+ char ** reconstruct_X = (char **) malloc(n_threads * sizeof(char *));\n+ char ** reconstruct_Y = (char **) malloc(n_threads * sizeof(char *));\n+ if(reconstruct_Y == NULL || reconstruct_X == NULL) terror("Could not allocate output alignment sequences");\n+ char ** writing_buffer_alignment = (char **) malloc(n_threads * sizeof(char*));\n+ for(i=0;i<n_threads;i++){\n+\t\n+ \ttable[i] = (struct cell **) malloc(MAX_READ_SIZE * sizeof(struct cell *));\n+\t for(j=0;j<MAX_READ_SIZE;j++){\n+ table[i][j] = (struct cell *) malloc((1+MAX_WINDOW_SIZE)*sizeof(struct cell));\n+\t\t if(table[i][j] == NULL) terror("Could not allocate memory for second loop of table");\n+ '..b'_comp, uint64_t * custom_kmer, unsigned char * hits_only, uint64_t * n_parts){\n+\n+ int pNum = 0;\n+ while(pNum < argc){\n+ if(strcmp(av[pNum], "--verbose") == 0) VERBOSE_ACTIVE = 1;\n+ if(strcmp(av[pNum], "--help") == 0){\n+ fprintf(stdout, "USAGE:\\n");\n+ fprintf(stdout, " IMSAME -query [query] -db [database]\\n");\n+ fprintf(stdout, "OPTIONAL:\\n");\n+ fprintf(stdout, " -n_threads [Integer: 0<n_threads] (default 4)\\n");\n+ fprintf(stdout, " -evalue [Double: 0<=pval<1] (default: 1 * 10^-10)\\n");\n+ fprintf(stdout, " -coverage [Double: 0<coverage<=1 (default: 0.5)\\n");\n+ fprintf(stdout, " -identity [Double: 0<identity<=1 (default: 0.5)\\n");\n+ fprintf(stdout, " -igap [Integer: (default: 5)\\n");\n+ fprintf(stdout, " -egap [Integer: (default: 2)\\n");\n+ fprintf(stdout, " -window [Double: 0<window<=0.5 (default: 0.15)\\n");\n+ fprintf(stdout, " -kmer [Integer: k>1 (default 12)]\\n");\n+\t fprintf(stdout, "\t\t-n_parts [Integer:\tn>0 (default 3)]\\n");\n+ fprintf(stdout, " -out [File path]\\n");\n+ fprintf(stdout, " --full Does not stop at first match and reports all equalities\\n");\n+ fprintf(stdout, " --verbose Turns verbose on\\n");\n+ fprintf(stdout, " --help Shows help for program usage\\n");\n+ fprintf(stdout, " --hits Compute only the non-overlapping hits\\n");\n+ exit(1);\n+ }\n+ if(strcmp(av[pNum], "--full") == 0) *full_comp = TRUE;\n+ if(strcmp(av[pNum], "--hits") == 0) *hits_only = TRUE;\n+\tif(strcmp(av[pNum], "-n_parts") == 0) *n_parts = (uint64_t) atoi(av[pNum+1]);\n+ if(strcmp(av[pNum], "-query") == 0){\n+ *query = fopen64(av[pNum+1], "rt");\n+ if(query==NULL) terror("Could not open query file");\n+ }\n+ if(strcmp(av[pNum], "-db") == 0){\n+ *database = fopen64(av[pNum+1], "rt");\n+ if(database==NULL) terror("Could not open database file");\n+ }\n+ if(strcmp(av[pNum], "-out") == 0){\n+ *out_database = fopen64(av[pNum+1], "wt");\n+ if(out_database==NULL) terror("Could not open output database file");\n+ }\n+ if(strcmp(av[pNum], "-evalue") == 0){\n+ *minevalue = (long double) atof(av[pNum+1]);\n+ if(*minevalue < 0) terror("Min-e-value must be larger than zero");\n+ }\n+ if(strcmp(av[pNum], "-window") == 0){\n+ *window = (long double) atof(av[pNum+1]);\n+ if(*window <= 0 || *window > 0.5) terror("Window percentage size must lie between 0<window<=0.5");\n+ }\n+ if(strcmp(av[pNum], "-coverage") == 0){\n+ *mincoverage = (long double) atof(av[pNum+1]);\n+ if(*mincoverage <= 0) terror("Min-coverage must be larger than zero");\n+ }\n+ if(strcmp(av[pNum], "-identity") == 0){\n+ *minidentity = (long double) atof(av[pNum+1]);\n+ if(*minidentity <= 0) terror("Min-identity must be larger than zero");\n+ }\n+ if(strcmp(av[pNum], "-igap") == 0){\n+ *igap = - (atoi(av[pNum+1]));\n+ }\n+ if(strcmp(av[pNum], "-egap") == 0){\n+ *egap = - (atoi(av[pNum+1]));\n+ }\n+ if(strcmp(av[pNum], "-n_threads") == 0){\n+ *n_threads = (uint64_t) atoi(av[pNum+1]);\n+ if(*n_threads < 0) terror("Number of threads must be larger than zero");\n+ }\n+ if(strcmp(av[pNum], "-kmer") == 0){\n+ *custom_kmer = (uint64_t) atoi(av[pNum+1]);\n+ if(*custom_kmer < 2) terror("K-mer size must be larger than 1");\n+ }\n+ pNum++;\n+ }\n+ \n+ if(*query==NULL || *database==NULL) terror("A query and database is required");\n+}\n+\n' |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/src/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/src/Makefile Sat Dec 15 18:04:10 2018 -0500 |
b |
@@ -0,0 +1,23 @@ +CC=gcc +CXX=g++ +CFLAGS=-O3 -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -Wall #-DVERBOSE +BIN=../bin + +all: IMSAME revcomp combine covident + + + +IMSAME: IMSAME.c + $(CC) $(CFLAGS) alignmentFunctions.c -lpthread -lm commonFunctions.c -msse -lm IMSAME.c -lpthread -lm -o $(BIN)/IMSAME + +revcomp: reverseComplement.c + $(CC) $(CFLAGS) commonFunctions.c -lm reverseComplement.c -o $(BIN)/revComp + +combine: combine_reads.c + $(CC) $(CFLAGS) commonFunctions.c -lm combine_reads.c -o $(BIN)/combine + +covident: covident.c + $(CC) $(CFLAGS) covident.c -o $(BIN)/combine + +clean: + rm -rf $(BIN)/IMSAME $(BIN)/revComp $(BIN)/combine |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/src/alignmentFunctions.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/src/alignmentFunctions.c Sat Dec 15 18:04:10 2018 -0500 |
[ |
b'@@ -0,0 +1,1718 @@\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <string.h>\n+#include <ctype.h>\n+#include <pthread.h>\n+#include <inttypes.h>\n+#include <math.h>\n+#include <float.h>\n+#include "structs.h"\n+#include "alignmentFunctions.h"\n+#include "commonFunctions.h"\n+#define MAX(x, y) (((x) > (y)) ? (x) : (y))\n+#define MIN(x, y) (((x) <= (y)) ? (x) : (y))\n+\n+int64_t compare_letters(unsigned char a, unsigned char b){\n+ if(a != (unsigned char) \'N\' && a != (unsigned char) \'>\') return (a == b) ? POINT : -POINT;\n+ return -POINT;\n+}\n+\n+llpos * getNewLocationllpos(Mempool_l * mp, uint64_t * n_pools_used){\n+\n+ if(mp[*n_pools_used].current == POOL_SIZE){\n+ *n_pools_used += 1;\n+ if(*n_pools_used == MAX_MEM_POOLS) terror("Reached max pools");\n+ init_mem_pool_llpos(&mp[*n_pools_used]);\n+ \n+ }\n+\n+ llpos * new_pos = mp[*n_pools_used].base + mp[*n_pools_used].current;\n+ mp[*n_pools_used].current++;\n+\n+ \n+ return new_pos;\n+}\n+\n+void init_mem_pool_llpos(Mempool_l * mp){\n+ mp->base = (llpos *) calloc(POOL_SIZE, sizeof(llpos));\n+ if(mp->base == NULL) terror("Could not request memory pool");\n+ mp->current = 0;\n+}\n+\n+\n+void * load_input(void * a){\n+\n+ LoadingDBArgs * ldbargs = (LoadingDBArgs *) a;\n+ \n+ // Requires\n+ /*\n+ char * temp_seq_buffer;\n+ SeqInfo * data_database; 1 per thread\n+ uint64_t t_len;\n+ uint64_t word_size;\n+ uint64_t read_from;\n+ uint64_t read_to;\n+ char thread_id;\n+ */\n+\n+ uint64_t c_pos;\n+ \n+ unsigned char curr_kmer[custom_kmer];\n+ unsigned char aux_kmer[custom_kmer+1];\n+ curr_kmer[0] = \'\\0\';\n+ uint64_t word_size = 0, pos_in_database = 0;\n+ unsigned char char_converter[91];\n+ uint64_t curr_seq = 0;\n+ char_converter[(unsigned char)\'A\'] = 0;\n+ char_converter[(unsigned char)\'C\'] = 1;\n+ char_converter[(unsigned char)\'G\'] = 2;\n+ char_converter[(unsigned char)\'T\'] = 3;\n+ //llpos * aux;\n+ AVLTree * pointer;\n+\n+ char c;\n+\n+ /*\n+ if(ldbargs->thread_id == \'A\'){\n+ printf("read to is: %"PRIu64"\\n", ldbargs->read_to);\n+ printf("make sure: %c %c %c\\n", ldbargs->temp_seq_buffer[ldbargs->read_to-1], ldbargs->temp_seq_buffer[ldbargs->read_to], ldbargs->temp_seq_buffer[ldbargs->read_to+1]);\n+ \n+ uint64_t z = ldbargs->read_to-1;\n+ while(ldbargs->temp_seq_buffer[z] != \'>\'){\n+ printf("%c", ldbargs->temp_seq_buffer[z]);\n+ z--;\n+ }\n+ getchar();\n+ }\n+ if(ldbargs->thread_id == \'C\'){\n+ printf("HELLOOOOOOO im going from %"PRIu64"\\n", ldbargs->read_from);\n+ }\n+ */\n+\n+ c_pos = ldbargs->read_from;\n+ while(ldbargs->temp_seq_buffer[c_pos] != \'>\') ++c_pos;\n+ ldbargs->read_from = c_pos;\n+ c_pos = ldbargs->read_to;\n+ while(c_pos < ldbargs->t_len && ldbargs->temp_seq_buffer[c_pos] != \'>\') ++c_pos;\n+ ldbargs->read_to = c_pos;\n+\n+ c_pos = ldbargs->read_from;\n+ c = ldbargs->temp_seq_buffer[c_pos];\n+\n+ \n+ //printf("thread going from %"PRIu64" to %"PRIu64"\\n", ldbargs->read_from, ldbargs->read_to);\n+ \n+\n+ while(c_pos < ldbargs->read_to){\n+ \n+\n+ if(c == \'>\'){\n+ \n+ //if(ldbargs->thread_id == \'G\') printf("putting in %"PRIu64" @ %"PRIu64"\\n", curr_seq, c_pos);\n+ ldbargs->data_database->start_pos[curr_seq] = pos_in_database; ++curr_seq;\n+\n+ // REalloc sequences and sequence index\n+ if(pos_in_database == READBUF*ldbargs->n_allocs){\n+ ldbargs->n_allocs++; ldbargs->data_database->sequences = (unsigned char *) realloc(ldbargs->data_database->sequences, READBUF*ldbargs->n_allocs*sizeof(unsigned char));\n+ if(ldbargs->data_database->sequences == NULL) terror("Could not reallocate temporary database");\n+ }\n+\n+ if(curr_seq == INITSEQS*ldbargs->n_allocs){\n+ ldbargs->n_allocs++; ldbargs->data_database->start_pos = (uint64_t *) realloc(ldbargs->data_database->start_pos, INITSEQS'..b'height = MAX(height(y->left), height(y->right))+1;\n+ \n+ // Return new root\n+ return y;\n+}\n+ \n+// Get Balance factor of node N\n+\n+int64_t get_balance(AVLTree * N){\n+ if (N == NULL)\n+ return 0;\n+ return height(N->left) - height(N->right);\n+}\n+\n+/* Substituted by (node == NULL) ? (0) : ((int64_t) node->left->height - (int64_t) node->right->height) */\n+\n+AVLTree * find_AVLTree(AVLTree * node, uint64_t key){\n+ AVLTree * found = NULL;\n+ if(node == NULL) return NULL;\n+\n+ if (key < node->key) {\n+ found = find_AVLTree(node->left, key);\n+ } else if (key > node->key) {\n+ found = find_AVLTree(node->right, key);\n+ } else { \n+ return node;\n+ }\n+ return found;\n+} \n+\n+llpos * find_AVLTree_llpos(AVLTree * node, uint64_t key){\n+ llpos * aux = NULL;\n+ if(node == NULL) return NULL;\n+\n+ if (key < node->key) {\n+ aux = find_AVLTree_llpos(node->left, key);\n+ } else if (key > node->key) {\n+ aux = find_AVLTree_llpos(node->right, key);\n+ } else { \n+ return node->next;\n+ }\n+ return aux;\n+}\n+\n+// Recursive function to insert key in subtree rooted\n+// with node and returns new root of subtree.\n+AVLTree * insert_AVLTree(AVLTree * node, uint64_t key, Mempool_AVL * mp, uint64_t * n_pools_used, uint64_t pos, Mempool_l * mp_l, uint64_t * n_pools_used_l, uint64_t s_id){\n+ /* 1. Perform the normal BST insertion */\n+ if (node == NULL){\n+ \n+ AVLTree * n_node = getNewLocationAVLTree(mp, n_pools_used, key);\n+ llpos * aux = getNewLocationllpos(mp_l, n_pools_used_l);\n+ aux->pos = pos;\n+ aux->s_id = s_id;\n+ n_node->next = aux;\n+ return n_node;\n+ }\n+ \n+ if (key < node->key) {\n+ node->left = insert_AVLTree(node->left, key, mp, n_pools_used, pos, mp_l, n_pools_used_l, s_id);\n+ } else if (key > node->key) {\n+ node->right = insert_AVLTree(node->right, key, mp, n_pools_used, pos, mp_l, n_pools_used_l, s_id);\n+ } else { \n+ // Equal keys are inserted as a linked list\n+ llpos * aux = getNewLocationllpos(mp_l, n_pools_used_l);\n+ aux->pos = pos;\n+ aux->s_id = s_id;\n+ aux->next = node->next;\n+ node->next = aux;\n+ ++(node->count);\n+ return node;\n+ }\n+ \n+ /* 2. Update height of this ancestor node */\n+ //node->height = 1 + MAX((node->left == NULL) ? (0) : (node->left->height), (node->right == NULL) ? (0) : (node->right->height));\n+ node->height = 1 + MAX(height(node->left), height(node->right));\n+ \n+ /* 3. Get the balance factor of this ancestor\n+ node to check whether this node became\n+ unbalanced */\n+ //int64_t balance = (node->left == NULL || node->right == NULL) ? (0) : ((int64_t) node->left->height - (int64_t) node->right->height);\n+ int64_t balance = get_balance(node);\n+ \n+ // If this node becomes unbalanced, then\n+ // there are 4 cases\n+ \n+ // Left Left Case\n+ if (balance > 1 && key < node->left->key)\n+ return right_rotate(node);\n+ \n+ // Right Right Case\n+ if (balance < -1 && key > node->right->key)\n+ return left_rotate(node);\n+ \n+ // Left Right Case\n+ if (balance > 1 && key > node->left->key)\n+ {\n+ node->left = left_rotate(node->left);\n+ return right_rotate(node);\n+ }\n+ \n+ // Right Left Case\n+ if (balance < -1 && key < node->right->key)\n+ {\n+ node->right = right_rotate(node->right);\n+ return left_rotate(node);\n+ }\n+ \n+ /* return the (unchanged) node pointer */\n+ return node;\n+}\n+ \n+// A utility function to print preorder traversal\n+// of the tree.\n+// The function also prints height of every node\n+\n+void pre_order(AVLTree * root){\n+ if(root != NULL){\n+ printf("%"PRIu64" ", root->key);\n+ llpos * aux = root->next;\n+ while(aux != NULL){ printf("#%"PRIu64", ", aux->pos); aux = aux->next; }\n+ pre_order(root->left);\n+ pre_order(root->right);\n+ }\n+}\n\\ No newline at end of file\n' |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/src/alignmentFunctions.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/src/alignmentFunctions.h Sat Dec 15 18:04:10 2018 -0500 |
b |
@@ -0,0 +1,136 @@ +#define QF_LAMBDA 0.275 +#define QF_KARLIN 0.333 + + + +typedef struct { + uint64_t id; //The thread id + SeqInfo * database; //Database sequence and lengths + SeqInfo * query; //Query sequence and lengths + uint64_t from; //Starting READ to compute alignments from + uint64_t to; //End READ to compute alignments from + AVLContainer * container_a; //Container to hold the multidimensional array + AVLContainer * container_b; //Container to hold the multidimensional array + AVLContainer * container_c; //Container to hold the multidimensional array + AVLContainer * container_d; //Container to hold the multidimensional array + uint64_t * contained_reads; + uint64_t * base_coordinates; + uint64_t accepted_query_reads; //Number of reads that have a fragment with evalue less than specified + long double min_e_value; //Minimum evalue to accept read + long double min_coverage; //Minimum coverage percentage to accept read + long double min_identity; //Minimum identity percentage to accept read + long double window; //Percentage of window that will be explored (+-) + FILE * out; //File to write alignments out + int igap; + int egap; + uint64_t * hits; // To work in hits mode only + struct positioned_cell * mc; + struct cell ** table; + char * reconstruct_X; + char * reconstruct_Y; + char * writing_buffer_alignment; + unsigned char * my_x; + unsigned char * my_y; + Head * queue_head; //To tell where the queue starts after modifications + pthread_mutex_t * lock; + unsigned char full_comp; // Tells whether read reporting should stop at first match or keep reporting + unsigned char * markers; // To tell which sequences were already used +} HashTableArgs; + + + +/* + Nucleotides matching function +*/ +int64_t compare_letters(unsigned char a, unsigned char b); + +/** + * Initialize the memory pool to later retrieve individual memory addresses for llpos + * + */ +void init_mem_pool_llpos(Mempool_l * mp); + +/** + * Get a new memory address from the pool mp for a type llpos + * + */ +llpos * getNewLocationllpos(Mempool_l * mp, uint64_t * n_pools_used); + +/* + Load input database using 4 threads +*/ +void * load_input(void * a); +/* + Compute alignments by thread given a hash table argument +*/ +void * computeAlignmentsByThread(void * a); + + +/* + Performs NW and backtracking to recover alignment +*/ +void build_alignment(char * reconstruct_X, char * reconstruct_Y, uint64_t curr_db_seq, uint64_t curr_read, HashTableArgs * hta, unsigned char * my_x, unsigned char * my_y, struct cell ** table, struct positioned_cell * mc, char * writing_buffer_alignment, BasicAlignment * ba, uint64_t xlen, uint64_t ylen, int64_t * cell_path_y, long double * window); + +/* + Compute the alignment and evalue of a given hit + The positions pos_database and pos_query refer to the last match in the hit +*/ +void alignmentFromQuickHits(SeqInfo * database, SeqInfo * query, uint64_t pos_database, uint64_t pos_query, uint64_t curr_read, uint64_t curr_db_seq, Quickfrag * qf, uint64_t offset_db_reads, uint64_t offset_db_coordinates); + +/* + Computes the cell path for the y points given incremental x + Only add +- window size to each to know which path to go through +*/ +void calculate_y_cell_path(Point p0, Point p1, Point p2, Point p3, int64_t * cell_path_y); +/* + Calculates NW table with two rows and stores a cellpath of scores, identities, gaps and starting and ending positions +*/ +struct best_cell NW(unsigned char * X, uint64_t Xstart, uint64_t Xend, unsigned char * Y, uint64_t Ystart, uint64_t Yend, int64_t iGap, int64_t eGap, struct cell ** table, struct positioned_cell * mc, int show, int64_t * cell_path_y, long double * window, uint64_t * curr_window_size); + +/* + Computes the alignment given a NW table +*/ +void backtrackingNW(unsigned char * X, uint64_t Xstart, uint64_t Xend, unsigned char * Y, uint64_t Ystart, uint64_t Yend, struct cell ** table, char * rec_X, char * rec_Y, struct best_cell * bc, uint64_t * ret_head_x, uint64_t * ret_head_y, BasicAlignment * ba, int64_t * cell_path_y, uint64_t window_size); + +/* + Get memory for a new AVL tree node +*/ +AVLTree * getNewLocationAVLTree(Mempool_AVL * mp, uint64_t * n_pools_used, uint64_t key); + +/* + Initialize a memory pool for AVL trees +*/ +void init_mem_pool_AVL(Mempool_AVL * mp); + +/* + Right rotate an AVL tree to make it balanced +*/ +AVLTree * right_rotate(AVLTree * y); + +/* + Left rotate an AVL tree to make it balanced +*/ +AVLTree * left_rotate(AVLTree * x); + + +/* + Find a key in an AVL tree +*/ +AVLTree * find_AVLTree(AVLTree * node, uint64_t key); + +/* + Find a key in an AVL tree but return its hit list +*/ +llpos * find_AVLTree_llpos(AVLTree * node, uint64_t key); + +/* + Insert node in AVL tree +*/ +AVLTree * insert_AVLTree(AVLTree * node, uint64_t key, Mempool_AVL * mp, uint64_t * n_pools_used, uint64_t pos, Mempool_l * mp_l, uint64_t * n_pools_used_l, uint64_t s_id); + +/* + Traverse AVL tree in pre order +*/ +void pre_order(AVLTree * root); + + |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/src/combine_reads.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/src/combine_reads.c Sat Dec 15 18:04:10 2018 -0500 |
[ |
@@ -0,0 +1,128 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include "structs.h" +#include "commonFunctions.h" + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) <= (y)) ? (x) : (y)) +#define STARTING_SEQS 1000 +#define PIECE_OF_DB_REALLOC 3200000 //half a gigabyte if divided by 8 bytes +#define MATVAL 101 + +// void terror(char *s) { +// fprintf(stdout, "ERR**** %s ****\n", s); +// exit(-1); +// } + + +int main(int argc, char ** av){ + + if(argc != 2) terror("USE:: combine_reads <file>"); + + + char names[10][100] = {"14np", "47np", "BR0716C", "BR11044P", "CA01044C", "CA01067C", "CA11149P", "CA11154P", "MA07066C", "MA09032P"}; + uint64_t llen[10] = {1758451, 959020, 1616163, 884193, 934256, 1007436, 1670521, 584151, 671010, 516632}; + uint64_t nseqs = 10; + + FILE * results, * data; + data = fopen(av[1], "rt"); + if(data == NULL) terror("Could not open input file"); + + uint64_t * mat = (uint64_t *) calloc(MATVAL, sizeof(uint64_t)); + if(mat == NULL) terror("Could not allocate matrix array"); + long double * mat_e = (long double *) calloc(MATVAL, sizeof(long double)); + if(mat_e == NULL) terror("Could not allocate float table"); + + char current_file[100]; + uint64_t i=5, idx=0; + while(av[1][i] != '_' && av[1][i+1] != 'g'){ + current_file[idx++] = av[1][i]; + i++; + } + current_file[idx] = '\0'; + fprintf(stdout, "Current file is %s\n", current_file); + + char buffer[MAXLID]; + if ((results = fopen("accu.log", "r")) == NULL){ + results = fopen("accu.log", "wt"); + }else{ + // Load the matrix + + + for(i=0;i<100;i++){ + if(0 == fgets(buffer, MAXLID, results)) terror("Missing number on load"); + + //fprintf(stdout, "Have %s\n", buffer); + buffer[strlen(buffer)-1] = '\0'; + //mat[i] = asciiToUint64(buffer); + mat_e[i] = (long double) atof(buffer); + //fprintf(stdout, "%"PRIu64"\n", mat[i]); + //getchar(); + } + fclose(results); + results = fopen("accu.log", "wt"); // Re open + } + + // Read file + uint64_t read_id_1, read_id_2, coverage, identity, length, current, currmax, j, last = 100000, lastmax = 0xFFFFFFFFFFFFFFFF; + while(!feof(data)){ + if(0 == fgets(buffer, MAXLID, data) && !feof(data)) terror("Missing values"); + // 2 77277 89 64 213 + sscanf(buffer, "%"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64, &read_id_1, &read_id_2, &coverage, &identity, &length); + //fprintf(stdout, "%"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64" %"PRIu64"\n", read_id_1, read_id_2, coverage, identity, length); + currmax = MIN(identity, coverage); + //fprintf(stdout, "%"PRIu64"\n", currmax); + current = read_id_1; + /* + for(j=currmax; j > 1; j--){ + if(current != lasts[j]){ + mat[j]++; + lasts[j] = current; + } + } + */ + if(lastmax == 0xFFFFFFFFFFFFFFFF){ + last = current; + lastmax = 0; + } + if(current == last){ + //mat[currmax] += 1; + if(lastmax < currmax){ + lastmax = currmax; + } + }else{ + mat[lastmax]++; + last = current; + lastmax = 0; + } + + } + + uint64_t divider = 0; + for(i=0; i < nseqs; i++){ + + if(strcmp(current_file, names[i]) == 0){ + divider = llen[i]; + } + } + if(divider == 0) terror("Could not find file for divider"); else fprintf(stdout, "The divider is %"PRIu64"\n", divider); + + for(j=99; j>0; j--){ + mat[j] += mat[j+1]; + } + mat[0] = mat[1]; + + for(j=0; j<100; j++){ + fprintf(stdout, "%"PRIu64" --> %.5f\n", mat[j], (float)((100*(long double)mat[j])/(long double)divider)); + + fprintf(results, "%.5f\n", (float)(mat_e[j] + ((100*(long double)mat[j])/(long double)divider))/2); + } + + + fclose(results); + fclose(data); + free(mat); + return 0; +} |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/src/commonFunctions.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/src/commonFunctions.c Sat Dec 15 18:04:10 2018 -0500 |
[ |
b'@@ -0,0 +1,283 @@\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <sys/time.h>\n+#include <inttypes.h>\n+#include <ctype.h>\n+#include <string.h>\n+#include <pthread.h>\n+#include <emmintrin.h>\n+#include <math.h>\n+#include "structs.h"\n+\n+#define ALIGNED_BYTES 16\n+\n+void terror(char *s) {\n+ fprintf(stdout, "ERR**** %s ****\\n", s);\n+ exit(-1);\n+}\n+\n+char buffered_fgetc(char *buffer, uint64_t *pos, uint64_t *read, FILE *f) {\n+ if (*pos >= READBUF) {\n+ *pos = 0;\n+ memset(buffer, 0, READBUF);\n+ *read = fread(buffer, 1, READBUF, f);\n+ }\n+ *pos = *pos + 1;\n+ return buffer[*pos-1];\n+}\n+\n+void get_num_seqs_and_length(char * seq_buffer, uint64_t * n_seqs, uint64_t * t_len, LoadingDBArgs * ldbargs){\n+ char check[16] = ">>>>>>>>>>>>>>>>";\n+ uint64_t a_fourth = *t_len / 4;\n+\n+ uint64_t i=0;\n+ \n+ __m128i * ptr1;\n+ __m128i * ptr2;\n+ __m128i res = _mm_setzero_si128();\n+ ptr2 = (__m128i *) check;\n+\n+ ldbargs[0].read_from = 0;\n+ for(i=0; i<a_fourth - 16; i+=16){\n+ ptr1 = (__m128i *) &seq_buffer[i];\n+ res = _mm_cmpeq_epi8(*ptr1, *ptr2);\n+ // TODO parallelize this\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[0]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[1]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[2]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[3]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[4]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[5]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[6]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[7]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[8]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[9]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[10]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[11]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[12]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[13]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[14]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[15]) >> 7;\n+ }\n+ while(seq_buffer[i] != \'>\'){ ++i; }\n+ ldbargs[0].read_to = i;\n+ ldbargs[0].data_database->n_seqs = *n_seqs;\n+ ldbargs[1].read_from = i;\n+ while(i % 16 != 0){ ++i; if(seq_buffer[i] == \'>\') terror("Sequence of length < 16"); }\n+ if(i > ldbargs[1].read_from) ++(*n_seqs); // For the advance to align bytes\n+\n+ //printf("seqs at 0: %"PRIu64" readfrom: %"PRIu64", readto: %"PRIu64", \\n", ldbargs[0].data_database->n_seqs, ldbargs[0].read_from, ldbargs[0].read_to);\n+\n+ \n+ for(i=i; i<2*(a_fourth) - 16; i+=16){\n+ ptr1 = (__m128i *) &seq_buffer[i];\n+ res = _mm_cmpeq_epi8(*ptr1, *ptr2);\n+ // TODO parallelize this\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[0]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[1]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[2]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[3]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[4]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[5]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[6]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[7]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[8]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[9]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[10]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[11]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[12]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[13]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[14]) >> 7;\n+ *n_seqs += (uint64_t) (((unsigned char *) &res)[15]) >> 7;\n+ }\n+ while(seq_buffer[i'..b'\t if(from >= t_reads){\n+\t\t queues[current_queue-1].next = NULL;\n+ return &queues[0];\n+ }\n+\n+\n+ if(i == levels - 1 && j == (i+1)*n_threads - 1){\n+ //If its the last \n+ queues[current_queue].next = NULL;\n+ }else{\n+ //Else add it to the queue\n+ queues[current_queue].next = &queues[current_queue+1];\n+ }\n+ \n+\n+ queues[current_queue].r1 = from;\n+ queues[current_queue].r2 = (to <= t_reads) ? (to) : (t_reads);\n+ current_queue++;\n+ //printf("current_piece: %"PRIu64"-%"PRIu64" diff: %"PRIu64"\\n", from, to, to - from);\n+\n+ }\n+\n+ }\n+ //printf("TREADS was %"PRIu64"\\n", t_reads); \n+ queues[current_queue-1].r2 = t_reads;\n+ return &queues[0];\n+}\n+\n+void print_queue(Queue * q){\n+ fprintf(stdout, "Task: %"PRIu64"-%"PRIu64"\\n", q->r1, q->r2);\n+}\n+\n+Queue * get_task_from_queue(Head * queue_head, pthread_mutex_t * lock){\n+ pthread_mutex_lock(lock);\n+\n+ Queue * ptr = queue_head->head;\n+ if(queue_head->head != NULL) queue_head->head = queue_head->head->next;\n+ //if(ptr != NULL){ printf("Taking "); /*print_queue(ptr);*/ }\n+\n+ pthread_mutex_unlock(lock);\n+\n+\n+ return ptr;\n+}\n+\n+uint64_t quick_pow4(uint64_t n){\n+ static uint64_t pow4[33]={1L, 4L, 16L, 64L, 256L, 1024L, 4096L, 16384L, 65536L, \n+ 262144L, 1048576L, 4194304L, 16777216L, 67108864L, 268435456L, 1073741824L, 4294967296L, \n+ 17179869184L, 68719476736L, 274877906944L, 1099511627776L, 4398046511104L, 17592186044416L, \n+ 70368744177664L, 281474976710656L, 1125899906842624L, 4503599627370496L, 18014398509481984L, \n+ 72057594037927936L, 288230376151711744L, 1152921504606846976L, 4611686018427387904L};\n+ return pow4[n];\n+}\n+\n+uint64_t quick_pow4byLetter(uint64_t n, const char c){\n+ static uint64_t pow4_G[33]={2*1L, 2*4L, 2*16L, 2*64L, 2*256L, 2*1024L, 2*4096L, 2*16384L, 2*65536L, \n+ (uint64_t)2*262144L, (uint64_t)2*1048576L,(uint64_t)2*4194304L, (uint64_t)2*16777216L, (uint64_t)2*67108864L, (uint64_t)2*268435456L, (uint64_t)2*1073741824L, (uint64_t)2*4294967296L, \n+ (uint64_t)2*17179869184L, (uint64_t)2*68719476736L, (uint64_t)2*274877906944L, (uint64_t)2*1099511627776L, (uint64_t)2*4398046511104L, (uint64_t)2*17592186044416L, \n+ (uint64_t)2*70368744177664L, (uint64_t)2*281474976710656L, (uint64_t)2*1125899906842624L, (uint64_t)2*4503599627370496L, (uint64_t)2*18014398509481984L, \n+ (uint64_t)2*72057594037927936L, (uint64_t) 2*288230376151711744L, (uint64_t) 2*1152921504606846976L, (uint64_t) 2*4611686018427387904L};\n+ \n+ static uint64_t pow4_T[33]={3*1L, 3*4L, 3*16L, 3*64L, 3*256L, 3*1024L, 3*4096L, 3*16384L, 3*65536L, \n+ (uint64_t)3*262144L, (uint64_t) 3*1048576L, (uint64_t)3*4194304L, (uint64_t)3*16777216L, (uint64_t)3*67108864L, (uint64_t)3*268435456L, (uint64_t)3*1073741824L, (uint64_t)3*4294967296L, \n+ (uint64_t)3*17179869184L, (uint64_t)3*68719476736L, (uint64_t)3*274877906944L, (uint64_t)3*1099511627776L, (uint64_t)3*4398046511104L, (uint64_t)3*17592186044416L, \n+ (uint64_t)3*70368744177664L, (uint64_t)3*281474976710656L, (uint64_t)3*1125899906842624L, (uint64_t)3*4503599627370496L, (uint64_t)3*18014398509481984L, \n+ (uint64_t)3*72057594037927936L, (uint64_t) 3*288230376151711744L, (uint64_t) 3*1152921504606846976L, (uint64_t) 3*4611686018427387904L};\n+ \n+ if(c == \'A\') return 0;\n+ if(c == \'C\') return quick_pow4(n);\n+ if(c == \'G\') return pow4_G[n];\n+ if(c == \'T\') return pow4_T[n];\n+ return 0;\n+}\n+\n+uint64_t hashOfWord(const unsigned char * word, uint32_t k){\n+ \n+ uint64_t value = 0, jIdx;\n+ for(jIdx=0;jIdx<k;jIdx++){\n+ value += quick_pow4byLetter(k-(jIdx+1), (char) word[jIdx]);\n+ }\n+ return value;\n+}\n+\n+\n+uint64_t asciiToUint64(const char *text){\n+\tuint64_t number=0;\n+\n+\tfor(;*text;text++){\n+\t\tchar digit=*text-\'0\'; \n+\t\tnumber=(number*10)+digit;\n+\t}\n+\treturn number;\n+}\n' |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/src/commonFunctions.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/src/commonFunctions.h Sat Dec 15 18:04:10 2018 -0500 |
b |
@@ -0,0 +1,41 @@ +#ifndef COMMON_FUNCTIONS_H +#define COMMON_FUNCTIONS_H +#include "structs.h" +/** + * Print the error message 's' and exit(-1) + */ +void terror(char *s); + + +/** + * Function to read char by char buffered from a FILE + */ +char buffered_fgetc(char *buffer, uint64_t *pos, uint64_t *read, FILE *f); + + +void get_num_seqs_and_length(char * seq_buffer, uint64_t * n_seqs, uint64_t * t_len, LoadingDBArgs * ldbargs); + +/* + Generates a queue of tasks for threads +*/ +Queue * generate_queue(Head * queue_head, uint64_t t_reads, uint64_t n_threads, uint64_t levels); + +/* + Prints a queue task +*/ +void print_queue(Queue * q); + +/* + Gets the next task to do when a pthread is free +*/ +Queue * get_task_from_queue(Head * queue_head, pthread_mutex_t * lock); + +uint64_t quick_pow4(uint64_t n); + +uint64_t quick_pow4byLetter(uint64_t n, const char c); + +uint64_t hashOfWord(const unsigned char * word, uint32_t k); + +uint64_t asciiToUint64(const char *text); + +#endif /* COMMON_FUNCTIONS_H */ |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/src/covident.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/src/covident.c Sat Dec 15 18:04:10 2018 -0500 |
[ |
@@ -0,0 +1,65 @@ +/* +for i in $IMS/_distribution; do ./covident $i computed/$(basename $i); done +gcc -O3 -D_FILE_OFFSET_BITS=64 covident.c -o covident +**********/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <inttypes.h> + +#define MAXLID 200 +#define OUT_MATSIZE 101 + + + +int main(int argc, char ** av){ + + if(argc != 3){ printf("USE:covident in output.MAT\n"); exit(-1); } + + + FILE * f, * g; + f = fopen(av[1], "rt"); + if(f == NULL){ + printf("Error opening input file\n"); + exit(-1); + } + g = fopen(av[2], "wt"); + if(g == NULL){ + printf(" Error opening output file %s\n", av[2]); + exit(-1); + } + + + uint64_t matrix[OUT_MATSIZE][OUT_MATSIZE]; + uint64_t i,j; + for(i=0;i<OUT_MATSIZE;i++){ + for(j=0;j<OUT_MATSIZE;j++){ + matrix[i][j] = 0; + } + } + uint64_t cov, ident, len; + while(!feof(f)){ + + if(3 != fscanf(f, "%"PRIu64" %"PRIu64" %"PRIu64, &cov, &ident, &len) && !feof(f)){ printf("Did not read 3\n"); exit(-1);} + matrix[cov][ident]++; + + + } + + + for(i=0;i<OUT_MATSIZE;i++){ + for(j=0;j<OUT_MATSIZE;j++){ + fprintf(g, "%"PRIu64"\t", matrix[i][j]); + } + fprintf(g, "\n"); + } + + fclose(f); + fclose(g); + return 0; +} + + + + |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/src/reverseComplement.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/src/reverseComplement.c Sat Dec 15 18:04:10 2018 -0500 |
[ |
@@ -0,0 +1,123 @@ +/* reverseComplement.c + Program reading a FASTA file as input (one or multiple sequence) + and then returning the reverse complementary sequence in the output file. + It is important to note that for multiple sequence file the first input + sequence is the last one at the output file and the last one in the input + is the first one in the output. + oscart@uma.es + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <inttypes.h> +#include "commonFunctions.h" + +#define SEQSIZE 2000000000 +#define READINPUT 10000 +#define WSIZE 32 +#define NREADS 1000000 + +int main(int ac, char** av) { + FILE *fIn, *fOut; + int64_t i, j, nR, seqLen = 0; + char *seq, c, toW; + long *offset = NULL; + + if (ac != 3) + terror("USE: reverseComplement seqFile.IN reverseComplementarySeq.OUT"); + + /** + * Allocating memory for the sequence + * Fixed amount of memory, change it to loadSeqDB? + */ + if ((seq = (char*) malloc(SEQSIZE)) == NULL) + terror("memory for Seq"); + + if ((fIn = fopen(av[1], "rt")) == NULL) + terror("opening IN sequence FASTA file"); + + if ((fOut = fopen(av[2], "wt")) == NULL) + terror("opening OUT sequence Words file"); + + if ((offset = (long *) malloc(sizeof(long)*NREADS)) == NULL) + terror("memory for offsets"); + + for(i=0;i<NREADS;i++){ + offset[i]=0; + } + + nR = 0; + c = fgetc(fIn); + while(!feof(fIn)){ + if(c == '>'){ + offset[nR++] = ftell(fIn)-1; + } + c = fgetc(fIn); + } + + for(i=nR-1; i>=0; i--){ + fseek(fIn, offset[i], SEEK_SET); + //READ and write header + fgets(seq, READINPUT, fIn); + + fprintf(fOut, "%s", seq); + //READ and write sequence + c = fgetc(fIn); + while(c != '>' && !feof(fIn)){ + if(isupper(c) || islower(c)){ + seq[seqLen++]=c; + } + c = fgetc(fIn); + } + for(j=seqLen-1; j >= 0; j--){ + switch(seq[j]){ + case 'A': + toW = 'T'; + break; + case 'C': + toW = 'G'; + break; + case 'G': + toW = 'C'; + break; + case 'T': + toW = 'A'; + break; + case 'U': + toW = 'A'; + break; + case 'a': + toW = 't'; + break; + case 'c': + toW = 'g'; + break; + case 'g': + toW = 'c'; + break; + case 't': + toW = 'a'; + break; + case 'u': + toW = 'a'; + break; + default: + toW = seq[j]; + break; + } + fwrite(&toW, sizeof(char), 1, fOut); + } + toW='\n'; + fwrite(&toW, sizeof(char), 1, fOut); + seqLen=0; + } + + fclose(fIn); + fclose(fOut); + + return 0; +} + + |
b |
diff -r 000000000000 -r 762009a91895 IMSAME/src/structs.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/IMSAME/src/structs.h Sat Dec 15 18:04:10 2018 -0500 |
[ |
@@ -0,0 +1,144 @@ +#ifndef STRUCTS_H +#define STRUCTS_H + +#include <inttypes.h> + +//Structs required for the dotplot workflow +#define MAXLID 200 +//#define READBUF 2000000000 //2 GB +#define READBUF 500000000 //500MB +#define INITSEQS 3000 //Number of starting sequences (in database) +#define POINT 4 + +#define FIXED_K 12 + +#define MAXLID 200 +#define ALIGN_LEN 60 //For NW alignment +#define MAX_READ_SIZE 10000 //Maximum length of read to have a portion of the table allocated +#define MAX_WINDOW_SIZE 1000 //Maximum window length to explore NW table +//#define POOL_SIZE 2500000000 // by 16 bytes it is 40 GB +#define POOL_SIZE 12500000 // 1 GB if 16 bytes +#define MAX_MEM_POOLS 128 +#define FIXED_LOADING_THREADS 4 + +#define FALSE 0 +#define TRUE 1 + +extern uint64_t custom_kmer; + +typedef char _vector_char __attribute__ ((vector_size (32*sizeof(char)))); + +//Struct for linked list of positions +typedef struct linked_list_pos{ + uint64_t pos; + uint64_t s_id; + //uint64_t extended_hash; + struct linked_list_pos * next; +} llpos; + +typedef struct AVL_Node{ + uint64_t key; + struct AVL_Node * left; + struct AVL_Node * right; + uint64_t height; + uint64_t count; + llpos * next; +} AVLTree; + +//Struct for memory pool por lists +typedef struct mempool_l{ + llpos * base; + uint64_t current; +} Mempool_l; + +//Struct for memory pool por AVLtree +typedef struct mempool_AVL{ + AVLTree * base; + uint64_t current; +} Mempool_AVL; + + +//Struct for a whole sequence(s) data +typedef struct seqinfo{ + unsigned char * sequences; + uint64_t * start_pos; + uint64_t total_len; + uint64_t n_seqs; +} SeqInfo; + +//Struct for the alignment of a quick hit +typedef struct quickfrag{ + uint64_t x_start; + uint64_t y_start; + uint64_t t_len; + long double coverage; + long double e_value; +} Quickfrag; + +typedef struct point{ + uint64_t x; + uint64_t y; +} Point; + +typedef struct container{ + llpos * table[4][4][4][4][4][4][4][4][4][4][4]; // One reduced; A,C,G,T tables in use +} Container; + +typedef struct AVLcontainer{ + AVLTree root[4][4][4][4][4][4][4][4][4][4][4]; +} AVLContainer; + +typedef struct{ + char * temp_seq_buffer; + SeqInfo * data_database; + uint64_t t_len; + uint64_t word_size; + uint64_t read_from; + uint64_t read_to; + uint64_t contained_reads; + uint64_t base_coordinates; + uint64_t n_allocs; + char thread_id; + AVLContainer * ct; + Mempool_l * mp; + uint64_t n_pools_used; + uint64_t n_pools_used_AVL; + Mempool_AVL * mp_AVL; +} LoadingDBArgs; + +struct cell{ + int64_t score; + uint64_t xfrom; + uint64_t yfrom; +}; + +struct positioned_cell{ + int64_t score; + uint64_t xpos; + uint64_t ypos; +}; + +struct best_cell{ + struct positioned_cell c; + uint64_t j_prime; +}; + +typedef struct{ + uint64_t identities; + uint64_t length; + uint64_t igaps; + uint64_t egaps; +} BasicAlignment; + +typedef struct queue{ + uint64_t r1; //reads region + uint64_t r2; + struct queue * next; +} Queue; + +typedef struct{ + Queue * head; +} Head; + + +#endif |