# HG changeset patch # User linda.bakker@wur.nl # Date 1426867864 -3600 # Node ID 05ff1c55db84776913547e407dc832d16e40ed34 # Parent 346ff9ad8c7abe05934b6c5ea2fe4236227b5ab7 fix for rankfilter, removed pfd read functional diff -r 346ff9ad8c7a -r 05ff1c55db84 rankfilter_GCMS/pdfread.py --- a/rankfilter_GCMS/pdfread.py Fri Mar 20 17:10:04 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,214 +0,0 @@ -""" -Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -""" - -import sys -import csv - -def getPDF(filename, print_progress): - ''' - Parses NIST PDF file - @param filename: PDF file to parse - ''' - NistInput = {} - NistInput_missed = {} - nist_input = open(filename, 'r').read() - - hitid = [] - rt = [] - name = [] - forward = [] - cas = [] - reverse = [] - prob = [] - lib_id = [] - nist_id = [] - missed_compounds = [] - id_missed_compounds = [] - formula = [] - - hit_list = nist_input.split('** Search Report Page 1 of 1 **') - hit_list.pop(0) - #number_hits = range(10) - line_id = 0 - for line in hit_list: - line = line.strip().translate(None, '\r') - if line != '': - hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit') #solution? : if we wouldn't replace the \n by ' ' but by some special sign, then reading formula would be simpler! - #strange....code seems fine actually...debug! See test/data/download.pdf - # strange thing is that it looks like the new line does not end up in the text file, eventhough it looks like there is a new line in the pdf...perhaps a bug in the pdf2text command in linux? - spec_id = hits.pop(0).split(' ')[1] - j = 0 - for hh in hits: - cell = hh.split(';') - if print_progress == True: - print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell - line_id += 1 - if len(cell) == 7: # the compound has CAS number - if len(cell[1].split(':')) == 2: - forward.append((cell[1].split(':')[1]).strip()) - # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end - if len(cell[0].split(':')) > 2: - name_tmp = ':'.join(cell[0].split(':')[1:]) - else: - name_tmp = cell[0].split(':')[1] - - name.append(name_tmp.replace(" ", " ").strip()) - name_tmp = name_tmp.strip().split(' ') - if name_tmp: - # if the name ends with a word that starts with C, F or H, then assume this last word is a formula: - if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': - formule = (name_tmp[-1]) - else: - formule = ('not_def') - else: - formule = ('not_def') - formula.append(formule.replace(" ", " ")) - reverse.append((cell[2].split(':')[1]).strip()) - prob.append(cell[3].split(' ')[2].replace('%', '')) - cas.append((cell[4].split(':')[1]).strip()) - lib_id.append((cell[5].split(':')[1]).strip()) - nist_id.append(cell[6].split(':')[1].replace('.', '').strip()) - j = j + 1 - else: - missed_compounds.append(hh) - id_missed_compounds.append(spec_id) - - elif len(cell) == 6: # the compound has no CAS number - if len(cell[1].split(':')) == 2: - - forward.append((cell[1].split(':')[1]).strip()) - # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end - if len(cell[0].split(':')) > 2: - name_tmp = ':'.join(cell[0].split(':')[1:]) - else: - name_tmp = cell[0].split(':')[1] - - name.append(name_tmp.replace(" ", " ").strip()) - name_tmp = name_tmp.strip().split(' ') - if name_tmp: - # if the name ends with a word that starts with C, F or H, then assume this last word is a formula: - if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': - formule = (name_tmp[-1]) - else: - formule = ('not_def') - else: - formule = ('not_def') - formula.append(formule.replace(" ", " ")) - reverse.append((cell[2].split(':')[1]).strip()) - prob.append(cell[3].split(' ')[2].replace('%', '')) - cas.append('undef') - lib_id.append((cell[4].split(':')[1]).strip()) - nist_id.append(cell[5].split(':')[1].replace('.', '').strip()) - j = j + 1 - - else: - missed_compounds.append(hh) - id_missed_compounds.append(spec_id) - - else: # Missing columns, report and quit - missed_compounds.append(hh) - id_missed_compounds.append(spec_id) - - for _ in range(j): - hitid.append(str(spec_id.replace(" ", " "))) - #NB: this is the RT as found in the "id" generated by e.g. msclust, so NOT the RT of the library hit: - rt.append(str(float(spec_id.split('-')[3]) / 1e+06)) - - NistInput['ID'] = hitid - NistInput['R.T.'] = rt - NistInput['Name'] = name - NistInput['CAS'] = cas - NistInput['Formula'] = formula - NistInput['Forward'] = forward - NistInput['Reverse'] = reverse - NistInput['Probability'] = prob - NistInput['Library'] = lib_id - NistInput['Library ID'] = nist_id - NistInput_missed['Missed Compounds'] = missed_compounds - NistInput_missed['ID missed Compounds'] = id_missed_compounds - - return NistInput, NistInput_missed - - -def convert_pdftotext2tabular(filename, output_file, error_file, print_progress): - ''' - Converts NIST PDF file to tabular format - @param filename: PDF file to parse - @param output_file: output file for the hits - @param error_file: output file for failed hits - ''' - [HitList, HitList_missed] = getPDF(filename, print_progress) - # save Hitlist as tab seperate file - Hitlist_as_text = "\t".join(HitList.keys()) + "\n" - Hitlist_array_of_array = ([HitList[row] for row in HitList.keys()]) - Hitlist_as_text += str("\n".join(["\t".join(e) for e in zip(*Hitlist_array_of_array)])) - output_fh = open(output_file, 'wb') - output_fh.write(Hitlist_as_text) - output_fh.close() - - out_missed_pdf = open(error_file, 'wb') - for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['ID missed Compounds']): - out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n") - out_missed_pdf.write('%s\n' % '\t'.join([y, x])) - out_missed_pdf.close() - - -def read_tabular(in_csv): - ''' - Parses a tab-separated file returning a dictionary with named columns - @param in_csv: input filename to be parsed - ''' - data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t')) - header = data.pop(0) - # Create dictionary with column name as key - output = {} - for index in xrange(len(header)): - output[header[index]] = [row[index] for row in data] - return output - - -def read_tabular_old(filename): - ''' - Function to read tabular format (created by convert_pdftotext2tabular) - and output a dict with header of columns as key and value is columns of tabular as list - @param filename: tabular file to read - ''' - input_fh = None - try: - input_fh = open(filename, 'r') - except IOError, error: - raise error - colnames = input_fh.readline().strip().split('\t') - cells = [] - for line in input_fh.readlines(): - cells.append(line.strip().split('\t')) - #transform from row oriented structure to column oriented structure - cells = zip(*cells) - #store the list of list in form of final output - RankFilterGC_format = {} - for colnumber in range(len(colnames)): - RankFilterGC_format[colnames[colnumber]] = cells[colnumber] - return RankFilterGC_format - - -if __name__ == '__main__': - convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)