Mercurial > repos > pieterlukasse > prims_metabolomics
view rankfilter_GCMS/pdfread.py @ 57:963684611ccb
fix for xcms support in msclust
author | pieter.lukasse@wur.nl |
---|---|
date | Fri, 12 Dec 2014 12:06:36 +0100 |
parents | 637830ac8bcd |
children | 35f506f30ae4 |
line wrap: on
line source
""" Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import sys import csv def getPDF(filename, print_progress): ''' Parses NIST PDF file @param filename: PDF file to parse ''' NistInput = {} NistInput_missed = {} nist_input = open(filename, 'r').read() hitid = [] rt = [] name = [] forward = [] cas = [] reverse = [] prob = [] lib_id = [] nist_id = [] missed_compounds = [] rt_missed_compounds = [] formula = [] hit_list = nist_input.split('** Search Report Page 1 of 1 **') hit_list.pop(0) #number_hits = range(10) line_id = 0 for line in hit_list: line = line.strip().translate(None, '\r') if line != '': hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit') #solution? : if we wouldn't replace the \n by ' ' but by some special sign, then reading formula would be simpler! #strange....code seems fine actually...debug! See test/data/download.pdf # strange thing is that it looks like the new line does not end up in the text file, eventhough it looks like there is a new line in the pdf...perhaps a bug in the pdf2text command in linux? spec_id = hits.pop(0).split(' ')[1] j = 0 for hh in hits: cell = hh.split(';') if print_progress == True: print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell line_id += 1 if len(cell) == 7: # the compound has CAS number if len(cell[1].split(':')) == 2: forward.append(cell[1].split(':')[1]) # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end if len(cell[0].split(':')) > 2: name_tmp = ':'.join(cell[0].split(':')[1:]) else: name_tmp = cell[0].split(':')[1] # uggly workaround for the cases where there ends up to be no space between the name and the formula: exaustive # replaces of known cases by the same with a white space: name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C') name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C') name_tmp = name_tmp.replace('-, LC', '-, L C').replace('-, DC', '-, D C') name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")) if name_tmp: if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H': formule = (name_tmp.split(' ')[-1]) else: formule = ('not_def') else: formule = ('not_def') formula.append(formule.replace(" ", " ")) reverse.append(cell[2].split(':')[1]) prob.append(cell[3].split(' ')[2].replace('%', '')) cas.append(cell[4].split(':')[1]) lib_id.append(cell[5].split(':')[1]) nist_id.append(cell[6].split(':')[1].replace('.', '').strip()) j = j + 1 else: missed_compounds.append(hh) rt_missed_compounds.append(spec_id) elif len(cell) >= 6: # the compound has no CAS number if len(cell[1].split(':')) == 2: forward.append(cell[1].split(':')[1]) # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end if len(cell[0].split(':')) > 2: name_tmp = ':'.join(cell[0].split(':')[1:]) else: name_tmp = cell[0].split(':')[1] name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C') name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C') name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")) # " ", " " name_tmp = name_tmp.strip().split(' ') if name_tmp: if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': formule = (name_tmp[-1]) else: formule = ('not_def') else: formule = ('not_def') formula.append(formule.replace(" ", " ")) reverse.append(cell[2].split(':')[1]) prob.append(cell[3].split(' ')[2].replace('%', '')) cas.append('undef') lib_id.append(cell[4].split(':')[1]) nist_id.append(cell[5].split(':')[1].replace('.', '').strip()) j = j + 1 else: missed_compounds.append(hh) rt_missed_compounds.append(spec_id) else: # Missing columns, report and quit return for _ in range(j): hitid.append(str(spec_id.replace(" ", " "))) rt.append(str(float(spec_id.split('-')[3]) / 1e+06)) NistInput['ID'] = hitid NistInput['R.T.'] = rt NistInput['Name'] = name NistInput['CAS'] = cas NistInput['Formula'] = formula NistInput['Forward'] = forward NistInput['Reverse'] = reverse NistInput['Probability'] = prob NistInput['Library'] = lib_id NistInput['Library ID'] = nist_id NistInput_missed['Missed Compounds'] = missed_compounds NistInput_missed['RT missed Compounds'] = rt_missed_compounds return NistInput, NistInput_missed def convert_pdftotext2tabular(filename, output_file, error_file, print_progress): ''' Converts NIST PDF file to tabular format @param filename: PDF file to parse @param output_file: output file for the hits @param error_file: output file for failed hits ''' [HitList, HitList_missed] = getPDF(filename, print_progress) # save Hitlist as tab seperate file Hitlist_as_text = "\t".join(HitList.keys()) + "\n" Hitlist_array_of_array = ([HitList[row] for row in HitList.keys()]) Hitlist_as_text += str("\n".join(["\t".join(e) for e in zip(*Hitlist_array_of_array)])) output_fh = open(output_file, 'wb') output_fh.write(Hitlist_as_text) output_fh.close() out_missed_pdf = open(error_file, 'wb') for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']): out_missed_pdf.write('%s\n' % '\t'.join([y, x])) out_missed_pdf.close() def read_tabular(in_csv): ''' Parses a tab-separated file returning a dictionary with named columns @param in_csv: input filename to be parsed ''' data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t')) header = data.pop(0) # Create dictionary with column name as key output = {} for index in xrange(len(header)): output[header[index]] = [row[index] for row in data] return output def read_tabular_old(filename): ''' Function to read tabular format (created by convert_pdftotext2tabular) and output a dict with header of columns as key and value is columns of tabular as list @param filename: tabular file to read ''' input_fh = None try: input_fh = open(filename, 'r') except IOError, error: raise error colnames = input_fh.readline().strip().split('\t') cells = [] for line in input_fh.readlines(): cells.append(line.strip().split('\t')) #transform from row oriented structure to column oriented structure cells = zip(*cells) #store the list of list in form of final output RankFilterGC_format = {} for colnumber in range(len(colnames)): RankFilterGC_format[colnames[colnumber]] = cells[colnumber] return RankFilterGC_format if __name__ == '__main__': convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)