Repository 'prims_metabolomics2'
hg clone https://toolshed.g2.bx.psu.edu/repos/pieterlukasse/prims_metabolomics2

Changeset 15:05ff1c55db84 (2015-03-20)
Previous changeset 14:346ff9ad8c7a (2015-03-20) Next changeset 16:fe4682eb938c (2015-03-23)
Commit message:
fix for rankfilter, removed pfd read functional
removed:
rankfilter_GCMS/pdfread.py
b
diff -r 346ff9ad8c7a -r 05ff1c55db84 rankfilter_GCMS/pdfread.py
--- a/rankfilter_GCMS/pdfread.py Fri Mar 20 17:10:04 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,214 +0,0 @@\n-"""\n-Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University \n-\n-Permission is hereby granted, free of charge, to any person obtaining a copy\n-of this software and associated documentation files (the "Software"), to deal\n-in the Software without restriction, including without limitation the rights\n-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n-copies of the Software, and to permit persons to whom the Software is\n-furnished to do so, subject to the following conditions:\n-\n-The above copyright notice and this permission notice shall be included in\n-all copies or substantial portions of the Software.\n-\n-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n-THE SOFTWARE.\n-"""\n-\n-import sys\n-import csv\n-\n-def getPDF(filename, print_progress):\n-    \'\'\'\n-    Parses NIST PDF file\n-    @param filename: PDF file to parse\n-    \'\'\'\n-    NistInput = {}\n-    NistInput_missed = {}\n-    nist_input = open(filename, \'r\').read()\n-\n-    hitid = []\n-    rt = []\n-    name = []\n-    forward = []\n-    cas = []\n-    reverse = []\n-    prob = []\n-    lib_id = []\n-    nist_id = []\n-    missed_compounds = []\n-    id_missed_compounds = []\n-    formula = []\n-\n-    hit_list = nist_input.split(\'** Search Report Page 1 of 1 **\')\n-    hit_list.pop(0)\n-    #number_hits = range(10)\n-    line_id = 0\n-    for line in hit_list:\n-        line = line.strip().translate(None, \'\\r\')\n-        if line != \'\':\n-            hits = line.replace(\'\\n\', \' \').replace(\'\\x0c\', \'\').replace(\'^L\', \'\').split(\'Hit\')  #solution? : if we wouldn\'t replace the \\n by \' \' but by some special sign, then reading formula would be simpler! \n-                                                                                                #strange....code seems fine actually...debug! See test/data/download.pdf \n-                                                                                                # strange thing is that it looks like the new line does not end up in the text file, eventhough it looks like there is a new line in the pdf...perhaps a bug in the pdf2text command in linux?\n-            spec_id = hits.pop(0).split(\' \')[1]\n-            j = 0\n-            for hh in hits:\n-                cell = hh.split(\';\')\n-                if print_progress == True:\n-                    print \'Processing line: \', line_id, \' with length: \', len(cell), \':\\n\\t\', cell\n-                line_id += 1\n-                if len(cell) == 7:  # the compound has CAS number\n-                    if len(cell[1].split(\':\')) == 2:\n-                        forward.append((cell[1].split(\':\')[1]).strip())\n-                        # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end\n-                        if len(cell[0].split(\':\')) > 2:\n-                            name_tmp = \':\'.join(cell[0].split(\':\')[1:])\n-                        else:\n-                            name_tmp = cell[0].split(\':\')[1]\n-                            \n-                        name.append(name_tmp.replace("  ", " ").strip())\n-                        name_tmp = name_tmp.strip().split(\' \')\n-                        if name_tmp:\n-                            # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:\n-                            if name_tmp[-1][0] == \'C\' or name_tmp[-1][0] == \'F\' or name_tmp[-1][0] == \'H\':\n-                                formule = (name_tmp[-1])\n-                            else:\n-                                formule = (\'not_def\')\n-                        else:\n-                   '..b'nd(formule.replace("  ", " "))\n-                        reverse.append((cell[2].split(\':\')[1]).strip())\n-                        prob.append(cell[3].split(\' \')[2].replace(\'%\', \'\'))\n-                        cas.append(\'undef\')\n-                        lib_id.append((cell[4].split(\':\')[1]).strip())\n-                        nist_id.append(cell[5].split(\':\')[1].replace(\'.\', \'\').strip())\n-                        j = j + 1\n-\n-                    else:\n-                        missed_compounds.append(hh)\n-                        id_missed_compounds.append(spec_id)\n-\n-                else: # Missing columns, report and quit\n-                    missed_compounds.append(hh)\n-                    id_missed_compounds.append(spec_id)\n-\n-            for _ in range(j):\n-                hitid.append(str(spec_id.replace("  ", " ")))\n-                #NB: this is the RT as found in the "id" generated by e.g. msclust, so NOT the RT of the library hit:\n-                rt.append(str(float(spec_id.split(\'-\')[3]) / 1e+06))\n-\n-    NistInput[\'ID\'] = hitid\n-    NistInput[\'R.T.\'] = rt\n-    NistInput[\'Name\'] = name\n-    NistInput[\'CAS\'] = cas\n-    NistInput[\'Formula\'] = formula\n-    NistInput[\'Forward\'] = forward\n-    NistInput[\'Reverse\'] = reverse\n-    NistInput[\'Probability\'] = prob\n-    NistInput[\'Library\'] = lib_id\n-    NistInput[\'Library ID\'] = nist_id\n-    NistInput_missed[\'Missed Compounds\'] = missed_compounds\n-    NistInput_missed[\'ID missed Compounds\'] = id_missed_compounds\n-\n-    return NistInput, NistInput_missed\n-\n-\n-def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):\n-    \'\'\'\n-    Converts NIST PDF file to tabular format\n-    @param filename: PDF file to parse\n-    @param output_file: output file for the hits\n-    @param error_file: output file for failed hits\n-    \'\'\'\n-    [HitList, HitList_missed] = getPDF(filename, print_progress)\n-    # save Hitlist as tab seperate file\n-    Hitlist_as_text = "\\t".join(HitList.keys()) + "\\n"\n-    Hitlist_array_of_array = ([HitList[row] for row in HitList.keys()])\n-    Hitlist_as_text += str("\\n".join(["\\t".join(e) for e in zip(*Hitlist_array_of_array)]))\n-    output_fh = open(output_file, \'wb\')\n-    output_fh.write(Hitlist_as_text)\n-    output_fh.close()\n-\n-    out_missed_pdf = open(error_file, \'wb\')\n-    for x, y in zip(HitList_missed[\'Missed Compounds\'], HitList_missed[\'ID missed Compounds\']):\n-        out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\\n")\n-        out_missed_pdf.write(\'%s\\n\' % \'\\t\'.join([y, x]))\n-    out_missed_pdf.close()\n-\n-\n-def read_tabular(in_csv):\n-    \'\'\'\n-    Parses a tab-separated file returning a dictionary with named columns\n-    @param in_csv: input filename to be parsed\n-    \'\'\'\n-    data = list(csv.reader(open(in_csv, \'rU\'), delimiter=\'\\t\'))\n-    header = data.pop(0)\n-    # Create dictionary with column name as key\n-    output = {}\n-    for index in xrange(len(header)):\n-        output[header[index]] = [row[index] for row in data]\n-    return output\n-\n-\n-def read_tabular_old(filename):\n-    \'\'\'\n-    Function to read tabular format (created by convert_pdftotext2tabular)\n-    and output a dict with header of columns as key and value is columns of tabular as list\n-    @param filename: tabular file to read\n-    \'\'\'\n-    input_fh = None\n-    try:\n-        input_fh = open(filename, \'r\')\n-    except IOError, error:\n-        raise error\n-    colnames = input_fh.readline().strip().split(\'\\t\')\n-    cells = []\n-    for line in input_fh.readlines():\n-        cells.append(line.strip().split(\'\\t\'))\n-    #transform from row oriented structure to column oriented structure\n-    cells = zip(*cells)\n-    #store the list of list in form of final output\n-    RankFilterGC_format = {}\n-    for colnumber in range(len(colnames)):\n-        RankFilterGC_format[colnames[colnumber]] = cells[colnumber]\n-    return RankFilterGC_format\n-\n-\n-if __name__ == \'__main__\':\n-    convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)\n'