view rankfilter_GCMS/pdfread.py @ 60:35f506f30ae4

fixed small rule in pdfread, and other small enhancements
author pieter.lukasse@wur.nl
date Fri, 19 Dec 2014 11:30:22 +0100
parents 637830ac8bcd
children d685210eef3e
line wrap: on
line source

"""
Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University 

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""

import sys
import csv

def _insert_formula_gap(text, handle_stereo_prefix):
    '''
    Re-inserts the blank that sometimes goes missing between the compound
    name and the formula in the text version of the PDF
    @param text: name + formula part of a hit
    @param handle_stereo_prefix: when True also split the ', LC' and ', DC' cases
    @return: text with ' C' instead of 'C' for the known glued cases
    '''
    # ugly workaround: exhaustive list of known "<last char of name>C" cases;
    # every pattern is replaced by the same two characters with a blank in between
    for glued in ('lC', ']C', 'sC', '9C', '.C', ')C', 'eC', 'yC', 'oC', '-C', 'dC', 'rC'):
        text = text.replace(glued, glued[0] + ' C')
    if handle_stereo_prefix:
        text = text.replace(', LC', ', L C').replace(', DC', ', D C')
    return text


def _parse_hit(cell, has_cas):
    '''
    Extracts the column values from the ';'-separated fields of one hit
    @param cell: list with the fields of the hit (at least 6, exactly 7 when has_cas)
    @param has_cas: True when field 4 holds the CAS number (7 field layout)
    @return: tuple (name, formula, forward, reverse, probability, cas, library, library id)
    '''
    # everything behind the first ':' is name + formula; the name itself may contain ':'
    name_tmp = cell[0].split(':', 1)[1]
    # the ', LC' / ', DC' rule is only applied to hits with a CAS number
    # NOTE(review): probably also wanted for hits without CAS number - confirm
    name_tmp = _insert_formula_gap(name_tmp, has_cas)

    # NOTE(review): the slice bound is the number of characters, not the number
    # of words, so normally all words (formula included) stay in the name.
    # Kept as is because the 'Name' column of existing output depends on it.
    full_name = ' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1]).replace("  ", " ")

    # the formula is the last word, accepted only when it starts with C, F or H;
    # strip first so that a trailing blank does not hide it (an empty last word
    # used to raise an IndexError)
    last_word = name_tmp.strip().split(' ')[-1]
    if last_word[:1] in ('C', 'F', 'H'):
        formule = last_word
    else:
        formule = 'not_def'

    forward_value = cell[1].split(':')[1]
    reverse_value = cell[2].split(':')[1]
    probability = cell[3].split(' ')[2].replace('%', '')
    if has_cas:
        cas_number = cell[4].split(':')[1]
        library_field, id_field = cell[5], cell[6]
    else:
        cas_number = 'undef'
        library_field, id_field = cell[4], cell[5]
    library = library_field.split(':')[1]
    library_id = id_field.split(':')[1].replace('.', '').strip()
    return (full_name, formule, forward_value, reverse_value, probability,
            cas_number, library, library_id)


def getPDF(filename, print_progress):
    '''
    Parses NIST PDF file (the text version of it)
    @param filename: PDF file to parse
    @param print_progress: when true, print the fields of every hit while parsing
    @return: tuple (NistInput, NistInput_missed). NistInput maps the column names
             'ID', 'R.T.', 'Name', 'CAS', 'Formula', 'Forward', 'Reverse',
             'Probability', 'Library' and 'Library ID' to equally long lists of
             strings (one entry per parsed hit). NistInput_missed maps
             'Missed Compounds' and 'RT missed Compounds' to the raw text and the
             spectrum id of the hits that could not be parsed.
             None is returned (after a message on stderr) as soon as a hit with
             less than 6 ';'-separated fields is found.
    '''
    NistInput = {}
    NistInput_missed = {}
    # read everything at once, closing the file also when reading fails
    with open(filename, 'r') as nist_fh:
        nist_input = nist_fh.read()

    hitid = []
    rt = []
    name = []
    forward = []
    cas = []
    reverse = []
    prob = []
    lib_id = []
    nist_id = []
    missed_compounds = []
    rt_missed_compounds = []
    formula = []

    # the text in front of the first page marker does not belong to a report
    reports = nist_input.split('** Search Report Page 1 of 1 **')[1:]
    line_id = 0
    for report in reports:
        report = report.strip().replace('\r', '')
        if report == '':
            continue
        # one report = spectrum description followed by its hits, each starting with 'Hit'
        # idea: if we wouldn't replace the \n by ' ' but by some special sign, then
        # reading the formula would be simpler. Strange thing is that the new line
        # does not always end up in the text file, even though it looks like there is
        # one in the pdf (see test/data/download.pdf)...perhaps a bug in pdf2text?
        hits = report.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')
        spec_id = hits.pop(0).split(' ')[1]
        parsed_hits = 0
        for hh in hits:
            cell = hh.split(';')
            if print_progress:
                print('Processing line:  %s  with length:  %s :\n\t%s' % (line_id, len(cell), cell))
            line_id += 1

            if len(cell) < 6:
                # Missing columns, report and quit
                sys.stderr.write('Hit %d (spectrum %s) has only %d fields, expected at least 6: %r\n'
                                 % (line_id, spec_id, len(cell), hh))
                return None

            if len(cell[1].split(':')) != 2:
                # second field is not of the form "label: value": keep the hit as missed
                missed_compounds.append(hh)
                rt_missed_compounds.append(spec_id)
                continue

            # exactly 7 fields: the compound has a CAS number, otherwise it has not
            hit = _parse_hit(cell, len(cell) == 7)
            name.append(hit[0])
            formula.append(hit[1])
            forward.append(hit[2])
            reverse.append(hit[3])
            prob.append(hit[4])
            cas.append(hit[5])
            lib_id.append(hit[6])
            nist_id.append(hit[7])
            parsed_hits += 1

        if parsed_hits:
            # 4th '-'-separated part of the spectrum id, divided by 1e6, is stored as R.T.;
            # only evaluated when at least one hit of this spectrum was parsed
            retention_time = str(float(spec_id.split('-')[3]) / 1e+06)
            hitid.extend([spec_id] * parsed_hits)
            rt.extend([retention_time] * parsed_hits)

    NistInput['ID'] = hitid
    NistInput['R.T.'] = rt
    NistInput['Name'] = name
    NistInput['CAS'] = cas
    NistInput['Formula'] = formula
    NistInput['Forward'] = forward
    NistInput['Reverse'] = reverse
    NistInput['Probability'] = prob
    NistInput['Library'] = lib_id
    NistInput['Library ID'] = nist_id
    NistInput_missed['Missed Compounds'] = missed_compounds
    NistInput_missed['RT missed Compounds'] = rt_missed_compounds

    return NistInput, NistInput_missed


def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
    '''
    Converts NIST PDF file to tabular format
    @param filename: PDF file to parse
    @param output_file: output file for the hits
    @param error_file: output file for failed hits
    @param print_progress: passed on to getPDF
    @raise ValueError: when getPDF returns None instead of its two dictionaries
    '''
    parsed = getPDF(filename, print_progress)
    if parsed is None:
        # getPDF gives up with None on a hit with missing columns; fail with a
        # clear message instead of an unpacking error, before any file is written
        raise ValueError('Could not parse %s: found a hit with missing columns' % filename)
    HitList, HitList_missed = parsed

    # save Hitlist as tab separated file: header line with the column names,
    # then one line per hit (the column lists are transposed into rows by zip)
    column_names = list(HitList.keys())
    hit_rows = zip(*[HitList[column] for column in column_names])
    Hitlist_as_text = "\t".join(column_names) + "\n"
    Hitlist_as_text += "\n".join(["\t".join(row) for row in hit_rows])
    with open(output_file, 'wb') as output_fh:
        output_fh.write(Hitlist_as_text)

    # failed hits: spectrum id, tab, raw text of the hit
    with open(error_file, 'wb') as out_missed_pdf:
        for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
            out_missed_pdf.write('%s\n' % '\t'.join([y, x]))


def read_tabular(in_csv):
    '''
    Parses a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    @return: dictionary with the cells of the first line as keys and the list
             of values found in that column as value; empty dictionary when
             the file has no lines at all
    '''
    # Python 2 needs the 'U' flag to get universal newlines; Python 3 text mode
    # translates newlines by default and recent versions reject the flag
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(in_csv, mode) as in_fh:
        data = list(csv.reader(in_fh, delimiter='\t'))
    if not data:
        return {}
    header = data[0]
    rows = data[1:]
    # Create dictionary with column name as key
    output = {}
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in rows]
    return output


def read_tabular_old(filename):
    '''
    Function to read tabular format (created by convert_pdftotext2tabular)
    and output a dict with header of columns as key and value is columns of tabular as list
    @param filename: tabular file to read
    @return: dictionary with the column names as keys and a tuple with the
             values of that column as value (empty tuples when the file has
             no lines after the header)
    '''
    # an IOError raised by open() simply propagates to the caller
    with open(filename, 'r') as input_fh:
        colnames = input_fh.readline().strip().split('\t')
        cells = [line.strip().split('\t') for line in input_fh.readlines()]
    if not cells:
        # header only: transposing nothing gives no columns to index into
        return dict((colname, ()) for colname in colnames)
    #transform from row oriented structure to column oriented structure
    #(list() so the result can be indexed when zip returns an iterator)
    columns = list(zip(*cells))
    #store the list of list in form of final output
    RankFilterGC_format = {}
    for colnumber, colname in enumerate(colnames):
        RankFilterGC_format[colname] = columns[colnumber]
    return RankFilterGC_format


if __name__ == '__main__':
    # Command line use: <file to parse> <output file for the hits> <output file for failed hits>
    # Progress printing is always switched on when run as a script.
    cli_input, cli_hits, cli_missed = sys.argv[1], sys.argv[2], sys.argv[3]
    convert_pdftotext2tabular(cli_input, cli_hits, cli_missed, True)