Mercurial > repos > padge > trimal
view trimal_repo/source/utils.cpp @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
line wrap: on
line source
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** trimAl v1.4: a tool for automated alignment trimming in large-scale phylogenetics analyses. readAl v1.4: a tool for automated alignment conversion among different formats. 2009-2015 Capella-Gutierrez S. and Gabaldon, T. [scapella, tgabaldon]@crg.es This file is part of trimAl/readAl. trimAl/readAl are free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, the last available version. trimAl/readAl are distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with trimAl/readAl. If not, see <http://www.gnu.org/licenses/>. ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ #include "utils.h" #include "values.h" #include "defines.h" /*++++++++++++++++++++++++++++++++++++++++++++++++ | void utils::initVect(int *, int, int) | | This method is used to initializate all | | positions of a vector with a given value. | ++++++++++++++++++++++++++++++++++++++++++++++++*/ void utils::initlVect(int *vector, int tam, int valor) { for(int i = 0; i < tam; i++) vector[i] = valor; } void utils::initlVect(float *vector, int tam, float valor) { for(int i = 0; i < tam; i++) vector[i] = valor; } /*+++++++++++++++++++++++++++++++++++++++++++++ | void utils::copyVect(int *, int *, int) | | This method copies integer vector 1 to | | integer vector 2. | +++++++++++++++++++++++++++++++++++++++++++++*/ void utils::copyVect(int *vect1, int *vect2, int tam) { for(int i = 0; i < tam; i++) vect2[i] = vect1[i]; } /*+++++++++++++++++++++++++++++++++++++++++++++++ | void utils::copyVect(float *, float *, float) | | This method copies float vector 1 to | | float vector 2. | +++++++++++++++++++++++++++++++++++++++++++++++*/ void utils::copyVect(float *vect1, float *vect2, int tam) { for(int i = 0; i < tam; i++) vect2[i] = vect1[i]; } /*+++++++++++++++++++++++++++++++++++++++++ | int utils::roundToInf(double) | | This method rounds a double number | | to the inferior integer. | +++++++++++++++++++++++++++++++++++++++++*/ int utils::roundToInf(double number) { return ((int) number); } /*+++++++++++++++++++++++++++++++++++++++++ | int utils::roundInt(double) | | This method rounds a double number | | to a integer. | +++++++++++++++++++++++++++++++++++++++++*/ int utils::roundInt(double number) { return ((int) ((double) number + 0.5)); } /*+++++++++++++++++++++++++++++++++++++++++ | int utils::roundToSup(double) | | This method rounds a double number | | to the greater integer. | +++++++++++++++++++++++++++++++++++++++++*/ int utils::roundToSup(double number) { return ((int) ((double) number + 1.0)); } /*+++++++++++++++++++++++++++++++++++++++++ | int utils::max(int, int) | | This method returns the maximum | | value of the two given arguments. | +++++++++++++++++++++++++++++++++++++++++*/ int utils::max(int x, int y) { if(x > y) return x; else return y; } float utils::max(float x, float y) { if(x > y) return x; else return y; } double utils::max(double x, double y) { if(x > y) return x; else return y; } int utils::min(int x, int y) { if(x < y) return x; else return y; } float utils::min(float x, float y) { if(x < y) return x; else return y; } double utils::min(double x, double y) { if(x < y) return x; else return y; } /*+++++++++++++++++++++++++++++++++++++++++++ | bool utils::isNumber(char *) | | This method checks if the given | | string is a float number. | +++++++++++++++++++++++++++++++++++++++++++*/ bool utils::isNumber(char *num){ int tam = strlen(num); int i, flt = 1, expn = 1, sgn = 1; for(i = 0; i < tam; i++) { if(num[i] == '.' && flt) flt = 0; else if(((num[i] == 'e') ||(num[i] == 'E')) && expn) expn = 0; else if(((num[i] == '+') ||(num[i] == '-')) && sgn) { if(!expn) sgn = 0; } else if(num[i] > '9' || num[i] < '0') return false; } return true; } /*+++++++++++++++++++++++++++++++++++++++++++ | bool utils::compare(char *, char *) | | This method compares the two strings | | given, and returns true if the two | | strings are equal. | +++++++++++++++++++++++++++++++++++++++++++*/ bool utils::compare(char *a, char *b){ return(!strcmp(a,b)); } /*++++++++++++++++++++++++++++++++++++++++++ | void utils::removeSpaces(char *, char *) | | This method removes spaces in the | | input string and put the result in | | the output string. | ++++++++++++++++++++++++++++++++++++++++++*/ void utils::removeSpaces(char *in, char *out){ unsigned int i, j = 0; for(i = 0; i < strlen(in); i++){ if(in[i] != ' ' && in[i] != '\t'){ out[j] = in[i]; j++; } } out[j] = '\0'; } /*++++++++++++++++++++++++++++++++++++++++++ | void utils::quicksort(float *, int, int) | | This method sorts the vector using | | the quicksort method. | ++++++++++++++++++++++++++++++++++++++++++*/ void utils::quicksort(float *vect, int ini, int fin) { float elem_div; int i, j; if ((ini >= fin) || (fin < 0)) return; elem_div = vect[fin]; i = ini - 1; j = fin; while (1) { while (vect[++i] < elem_div) if(i == fin) break; while (vect[--j] > elem_div) if(j == 0) break; if(i < j) swap(&vect[i], &vect[j]); else break; } swap(&vect[i], &vect[fin]); quicksort(vect, ini, i - 1); quicksort(vect, i + 1, fin); } /*++++++++++++++++++++++++++++++++++++++++ | void utils::swap(float *, float *) | | This method swaps the values in a | | and b. | ++++++++++++++++++++++++++++++++++++++++*/ void utils::swap(float *a, float *b){ float temp; temp = *a; *a = *b; *b = temp; } /*++++++++++++++++++++++++++++++++++++++++++ | void utils::quicksort(float *, int, int) | | This method sorts the vector using | | the quicksort method. | ++++++++++++++++++++++++++++++++++++++++++*/ void utils::quicksort(int *vect, int ini, int fin) { int i, j, elem_div; if ((ini >= fin) || (fin < 0)) return; elem_div = vect[fin]; i = ini - 1; j = fin; while (1) { while (vect[++i] < elem_div) if(i == fin) break; while (vect[--j] > elem_div) if(j == 0) break; if(i < j) swap(&vect[i], &vect[j]); else break; } swap(&vect[i], &vect[fin]); quicksort(vect, ini, i - 1); quicksort(vect, i + 1, fin); } /*++++++++++++++++++++++++++++++++++++++++ | void utils::swap(float *, float *) | | This method swaps the values in a | | and b. | ++++++++++++++++++++++++++++++++++++++++*/ void utils::swap(int *a, int *b) { int temp; temp = *a; *a = *b; *b = temp; } void utils::quicksort(int **vect, int ini, int fin) { float elem_div; int i, j; /* ***** ***** ***** ***** ***** ***** ***** ***** */ if ((ini >= fin) || (fin < 0)) return; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ elem_div = vect[fin][0]; i = ini - 1; j = fin; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ while (true) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ while (vect[++i][0] < elem_div) if(i == fin) break; while (vect[--j][0] > elem_div) if(j == 0) break; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(i < j) swap(&vect[i], &vect[j]); else break; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ swap(&vect[i], &vect[fin]); /* ***** ***** ***** ***** ***** ***** ***** ***** */ quicksort(vect, ini, i - 1); quicksort(vect, i + 1, fin); /* ***** ***** ***** ***** ***** ***** ***** ***** */ } void utils::swap(int **a, int **b){ int *temp; temp = *a; *a = *b; *b = temp; } /* ***************************************************************************** * * START - Refactored code * * ************************************************************************** */ bool utils::checkFile(ifstream &file) { /* Check if a given file exists and its size is greater than 0 */ long begin, end; /* Check whether input file exists or not */ if(!file) return false; /* Check input file sizes. A valid file should have a size grater than 0 */ begin = file.tellg(); file.seekg(0, ios::end); end = file.tellg(); file.seekg(0, ios::beg); /* Compare difference between file start and end. * Depending on result, return True or False */ if(!(end - begin)) return false; return true; } char* utils::readLine(ifstream &file) { /* Read a new line from current input stream. This function is better than * standard one since cares of operative system compability. It is useful * as well because remove tabs and blank spaces at lines beginning/ending */ int state; char c = ' '; string nline; static char *line = NULL; /* Check it the end of the file has been reached or not */ if(file.eof()) return NULL; /* Store first line found. For -Windows & MacOS compatibility- carriage return * is considered as well as a new line character */ for( ; (c != '\n') && ((!file.eof())); file.read(&c, 1)) if ((c != '\r')) nline.resize(nline.size() + 1, c); /* Remove blank spaces & tabs from the beginning of the line */ state = nline.find(" ", 0); while(state != (int) string::npos && state == 0) { nline.erase(state, 1); state = nline.find(" ", state); } state = nline.find("\t", 0); while(state != (int) string::npos && state == 0) { nline.erase(state, 1); state = nline.find("\t", state); } /* If there is nothing to return, give back a NULL pointer ... */ if(nline.size() == 0) return NULL; /* Otherwise, initialize the appropiate data structure, * dump the data and return it */ line = new char[nline.size() + 1]; strcpy(line, nline.c_str()); return line; } char* utils::trimLine(string nline) { /* This function is used to remove comments inbetween a biological sequence. * Remove all content surrounded by ("") or ([]). It wans as well when a * mismatch for these flags is found */ int pos, next; static char *line; /* Set-up lower and upper limit to look for comments inside of input string */ pos = -1; /* Identify comments inside of input sequence and remove it */ while(true) { pos = nline.find("\"", (pos + 1)); /* When there is not any more a comment inside of sequence, * go out from this loop */ if(pos == (int) string::npos) break; /* Look for closing flag */ next = nline.rfind("\"", nline.size()); /* If a pair of comments flags '"' is found, remove everything inbetween */ if((int) nline.find("\"", (pos + 1)) == next) { nline.erase(pos, (next - pos + 1)); pos = -1; } /* If there is only one flag '"' for comments inside of sequence, * user should be warned about that */ if (pos == next) { cerr << endl << "ERROR: Possible (\") mismatch for comments" << endl; return NULL; } } /* Look for other kind of comments, in this case those with [] */ while(true) { pos = -1; next = -1; /* Search for last opened bracket. It is supposed to be the first one for * being close */ while((pos = nline.find("[", (pos + 1))) != (int) string::npos) next = pos; /* If no opening bracket has been found. * Check if there is any closing one */ if (next == -1) { /* There are not any bracket in input string */ if ((int) nline.find("]", 0) == (int) string::npos) break; /* Otherwise, warn about the error */ cerr << endl << "ERROR: Brackets (]) mismatch found" << endl; return NULL; } /* Look for closest closing bracket to the opening one found */ pos = nline.find("]", (next + 1)); /* If no closing bracket has been found. Warn about the mismatch */ if (pos == (int) string::npos) { cerr << endl << "ERROR: Brackets ([) mismatch found" << endl; return NULL; } /* When both brackets have been found, remove comments inbetween them */ nline.erase(next, (pos - next + 1)); } /* Check if after removing all comments from input string there is still part * of sequences or not */ if(nline.size() == 0) return NULL; /* Initialize and store resulting sequence into an appropiate structure */ line = new char[nline.size() + 1]; strcpy(line, nline.c_str()); return line; } /* ***************************************************************************** * * END - Refactored code * * ************************************************************************** */ string utils::getReverse(string toReverse) { string line; int i; /* ***** ***** ***** ***** ***** ***** ***** ***** */ for(i = toReverse.size() - 1; i >= 0; i--) line += toReverse[i]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ return line; } string utils::removeCharacter(char c, string line) { int pos; /* ***** ***** ***** ***** ***** ***** ***** ***** */ pos = line.find(c, 0); while(pos != (int) string::npos) { line.erase(pos, 1); pos = line.find(c, pos); } /* ***** ***** ***** ***** ***** ***** ***** ***** */ return line; } int utils::checkTypeAlignment(int seqNumber, int residNumber, string *sequences) { int i, j, k, l, hitDNA, hitRNA, degenerate, gDNA, gRNA, extDNA, extRNA; float ratioDNA, ratioRNA; /* Standard tables */ char listRNA[11] = "AGCUNagcun"; char listDNA[11] = "AGCTNagctn"; /* Degenerate Nucleotides codes */ char degeneratedCodes[21] = "MmRrWwSsYyKkVvHhDdBb"; /* For each sequences, this method locks at the 100 letters (excluding gaps). * The method is able to distinguish between pure DNA/RNA nucleotides or those * containing degenerate Nucleotide letters */ for(i = 0, gDNA = 0, gRNA = 0, extDNA = 0, extRNA = 0; i < seqNumber; i++) { /* Looks at the 100 letters (excluding gaps) while doesn's get the sequence's end */ /* When there are less than a 100 characters, break the loop before reaching that limit */ residNumber = (int) sequences[i].size(); //~ for(j = 0, k = 0, hitDNA = 0, hitRNA = 0, degenerate = 0; j < residNumber && k < 100; j++) for(j = 0, k = 0, hitDNA = 0, hitRNA = 0, degenerate = 0; j < residNumber; j++) if(sequences[i][j] != '-' && sequences[i][j] != '.' && sequences[i][j] != '?') { k++; /* Recognizes between DNA and RNA. */ for(l = 0; l < (int) strlen(listDNA); l++) if(listDNA[l] == sequences[i][j]) hitDNA++; for(l = 0; l < (int) strlen(listRNA); l++) if(listRNA[l] == sequences[i][j]) hitRNA++; for(l = 0; l < (int) strlen(degeneratedCodes); l++) if(degeneratedCodes[l] == sequences[i][j]) degenerate++; } /* If input sequences have less than 95% of nucleotides, even when residues * are treated with degenerated codes, consider the input file as containing * amino-acidic sequences. */ ratioDNA = float(degenerate + hitDNA)/k; ratioRNA = float(degenerate + hitRNA)/k; if(ratioDNA < 0.95 && ratioRNA < 0.95) return AAType; /* Identify precisely if nucleotides sequences are DNA/RNA strict or * any degenerate code has been used in the sequence */ else if(hitRNA > hitDNA && degenerate == 0) gRNA++; else if(hitRNA > hitDNA && degenerate != 0) extRNA++; else if(hitRNA < hitDNA && degenerate == 0) gDNA++; else if(hitRNA < hitDNA && degenerate != 0) extDNA++; } /* Return the datatype with greater values, considering always degenerate * codes */ if (extDNA != 0 && extDNA > extRNA) return DNADeg; else if (extRNA != 0 && extDNA < extRNA) return RNADeg; else if(gRNA > gDNA) return RNAType; else return DNAType; } int* utils::readNumbers_StartEnd(string line) { int comma, nElems = 0; static int *numbers; comma = -1; while((comma = line.find(",", comma + 1)) != (int) string::npos) nElems += 2; //~ If there is more than two numbers separated by a comma, return NULL if(nElems != 2) return NULL; numbers = new int[2]; comma = line.find(",", 0); numbers[0] = atoi(line.substr(0, comma).c_str()); numbers[1] = atoi(line.substr(comma+1).c_str()); return numbers; } int* utils::readNumbers(string line) { int i, comma, separ, init, nElems = 0; static int *numbers; /* ***** ***** ***** ***** ***** ***** ***** ***** */ comma = -1; while((comma = line.find(",", comma + 1)) != (int) string::npos) nElems += 2; nElems += 2; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ numbers = new int[nElems + 1]; numbers[0] = nElems; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ init = 0; i = 1; /* ***** ***** ***** ***** ***** ***** ***** ***** */ do { /* ***** ***** ***** ***** ***** ***** ***** ***** */ comma = line.find(",", init); separ = line.find("-", init); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(((separ < comma) || (comma == (int) string::npos)) && (separ != (int) string::npos)) { numbers[i++] = atoi(line.substr(init, separ - init).c_str()); numbers[i++] = atoi(line.substr(separ+1, comma - separ - 1).c_str()); init = comma + 1; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ else if((separ > comma) || (separ == (int) string::npos)) { numbers[i++] = atoi(line.substr(init, comma - init).c_str()); numbers[i++] = atoi(line.substr(init, comma - init).c_str()); init = comma + 1; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(numbers[i-2] < 0) return NULL; if(numbers[i-1] < numbers[i-2]) return NULL; if(comma == (int) string::npos) break; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } while(true); /* ***** ***** ***** ***** ***** ***** ***** ***** */ return numbers; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } char utils::determineColor(char res, string column) { /* ***** ***** ***** ***** ***** ***** ***** ***** ***** */ if(toupper(res) == 'G') return 'o'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** */ else if(toupper(res) == 'P') return 'y'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** */ else if(res != '-') { switch(toupper(res)) { /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* (W, L, V, I, M, F): {50%, p}{60%, wlvimafcyhp} */ case 87: case 76: case 86: case 73: case 77: case 70: if(lookForPattern(column, "p", 0.5)) return 'b'; else if(lookForPattern(column, "wlvimafcyhp", 0.6)) return 'b'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* (A): {50%, p}{60%, wlvimafcyhp}{85% t,s,g} */ case 65: if(lookForPattern(column, "p", 0.5)) return 'b'; else if(lookForPattern(column, "wlvimafcyhp", 0.6)) return 'b'; else if(lookForPattern(column, "t", 0.85)) return 'b'; else if(lookForPattern(column, "s", 0.85)) return 'b'; else if(lookForPattern(column, "g", 0.85)) return 'b'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* BLUE: (C): {50%, p}{60%, wlvimafcyhp}{85% s} * PINK: (C): {85%, c} */ case 67: if(lookForPattern(column, "p", 0.5)) return 'b'; else if(lookForPattern(column, "wlvimafcyhp", 0.6)) return 'b'; else if(lookForPattern(column, "s", 0.85)) return 'b'; else if(lookForPattern(column, "c", 0.85)) return 'p'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* (K, R): {60%, kr}{85%, q} */ case 75: case 82: if(lookForPattern(column, "kr", 0.6)) return 'r'; else if(lookForPattern(column, "q", 0.85)) return 'r'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* (T): {50%, ts}{60%, wlvimafcyhp } */ case 84: if(lookForPattern(column, "ts", 0.5)) return 'g'; else if(lookForPattern(column, "wlvimafcyhp", 0.6)) return 'g'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* (S): {50%, ts}{80%, wlvimafcyhp } */ case 83: if(lookForPattern(column, "ts", 0.5)) return 'g'; else if(lookForPattern(column, "wlvimafcyhp", 0.8)) return 'g'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* (N): {50%, n}{85%, d } */ case 78: if(lookForPattern(column, "n", 0.5)) return 'g'; else if(lookForPattern(column, "d", 0.85)) return 'g'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* (Q): {50%, qe}{60%, kr} */ case 81: if(lookForPattern(column, "qe", 0.5)) return 'g'; else if(lookForPattern(column, "kr", 0.6)) return 'g'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* (D): {50%, de, n} */ case 68: if(lookForPattern(column, "de", 0.5)) return 'm'; else if(lookForPattern(column, "n", 0.5)) return 'm'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* (E): {50%, de,qe} */ case 69: if(lookForPattern(column, "de", 0.5)) return 'm'; else if(lookForPattern(column, "qe", 0.5)) return 'm'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* (H,Y): {50%, p}{60%, wlvimafcyhp} */ case 72: case 89: if(lookForPattern(column, "p", 0.5)) return 'c'; else if(lookForPattern(column, "wlvimafcyhp", 0.5)) return 'c'; else return 'w'; /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ } } return 'w'; } bool utils::lookForPattern(string column, string dataset, float level) { float count = 0; int i, j; for(i = 0; i < (int) column.size(); i++) { for(j = 0; j < (int) dataset.size(); j++) { if(toupper(column[i]) == toupper(dataset[j])) { count++; break; } } } if((count/column.size()) >= level) return true; else return false; }