prims_metabolomics: rankfilter_GCMS/pdfread.py comparison

comparison rankfilter_GCMS/pdfread.py @ 0:9d5f4f5f764b

Initial commit to toolshed

author	pieter.lukasse@wur.nl
date	Thu, 16 Jan 2014 13:10:00 +0100
parents
children	637830ac8bcd

comparison

equal deleted inserted replaced

--1:000000000000
+:9d5f4f5f764b
+"""
+Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+import sys
+import csv
# Per-hit output columns, in the order they are added to the result dict.
_HIT_COLUMNS = ('Name', 'CAS', 'Formula', 'Forward', 'Reverse',
                'Probability', 'Library', 'Library ID')


def _split_glued_formula(raw_name):
    '''
    Inserts a space between selected characters and a directly following "C",
    so that text glued onto the end of the name becomes a separate token.
    @param raw_name: name text taken from the first field of a hit
    '''
    for prefix in ('l', ']', 's', '9', '.', ')', 'e', 'y', 'o', '-', 'd', 'r'):
        raw_name = raw_name.replace(prefix + 'C', prefix + ' C')
    return raw_name


def _guess_formula(name_text):
    '''
    Returns the last space-separated token of name_text if it starts with
    "C", "F" or "H", otherwise "not_def".
    @param name_text: name text after _split_glued_formula
    '''
    # startswith() is safe on an empty token, where indexing [0] would raise
    # IndexError (empty name, or a name ending in a space).
    last_token = name_text.strip().split(' ')[-1]
    if last_token.startswith(('C', 'F', 'H')):
        return last_token
    return 'not_def'


def _parse_hit(cell, has_cas):
    '''
    Extracts the output columns from the ";"-separated fields of one hit.
    @param cell: list of fields of the hit
    @param has_cas: True if field 4 holds the CAS number (7-field hits);
                    otherwise CAS is reported as "undef"
    @return: dict keyed by _HIT_COLUMNS, or None when field 1 does not consist
             of exactly two ":"-separated parts (hit is reported as missed)
    '''
    forward_parts = cell[1].split(':')
    if len(forward_parts) != 2:
        return None

    # Everything after the first ":" is the name, which may itself contain ":".
    name_tmp = _split_glued_formula(cell[0].split(':', 1)[1])

    # NOTE(review): the slice end is the length of the *string*, not the number
    # of tokens, so for most inputs the last token is kept in the name. This
    # looks unintended, but it is preserved because it determines the output.
    name = ' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1]).replace("  ", " ")

    if has_cas:
        cas = cell[4].split(':')[1]
        library_field, id_field = cell[5], cell[6]
    else:
        cas = 'undef'
        library_field, id_field = cell[4], cell[5]

    return {
        'Name': name,
        'CAS': cas,
        'Formula': _guess_formula(name_tmp),
        'Forward': forward_parts[1],
        'Reverse': cell[2].split(':')[1],
        'Probability': cell[3].split(' ')[2].replace('%', ''),
        'Library': library_field.split(':')[1],
        'Library ID': id_field.split(':')[1].replace('.', '').strip(),
    }


def getPDF(filename, print_progress):
    '''
    Parses NIST PDF file (read here as plain text)
    @param filename: PDF file to parse
    @param print_progress: if true, print every hit that is processed
    @return: tuple (hits, missed). hits maps 'ID', 'R.T.' and the names in
             _HIT_COLUMNS to equally long lists of strings; missed maps
             'Missed Compounds' and 'RT missed Compounds' to lists.
             None is returned when a hit has fewer than 6 fields.
    '''
    # Close the input file deterministically instead of leaking the handle.
    with open(filename, 'r') as input_fh:
        nist_input = input_fh.read()

    columns = dict((key, []) for key in _HIT_COLUMNS)
    hitid = []
    rt = []
    missed_compounds = []
    rt_missed_compounds = []

    # Text before the first report marker is discarded.
    reports = nist_input.split('** Search Report Page 1 of 1 **')[1:]
    line_id = 0
    for report in reports:
        report = report.strip().replace('\r', '')
        if report == '':
            continue
        hits = report.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')
        # The text before the first 'Hit' carries the spectrum id as its
        # second space-separated token.
        spec_id = hits.pop(0).split(' ')[1]
        parsed_hits = 0
        for hh in hits:
            cell = hh.split(';')
            if print_progress:
                print('Processing line:  %s  with length:  %s :\n\t%s'
                      % (line_id, len(cell), cell))
                line_id += 1
            if len(cell) < 6:
                # Missing columns: report the reason, then quit as before.
                sys.stderr.write('Hit with fewer than 6 fields for %s, '
                                 'parsing aborted: %r\n' % (spec_id, hh))
                return None
            # Exactly 7 fields: the compound has a CAS number; otherwise not.
            record = _parse_hit(cell, has_cas=(len(cell) == 7))
            if record is None:
                missed_compounds.append(hh)
                rt_missed_compounds.append(spec_id)
                continue
            for key in _HIT_COLUMNS:
                columns[key].append(record[key])
            parsed_hits += 1
        if parsed_hits:
            # Only evaluated when at least one hit was parsed, so a spectrum id
            # without a 4th "-"-separated field fails only in that case.
            rt_value = str(float(spec_id.split('-')[3]) / 1e+06)
            hitid.extend([spec_id] * parsed_hits)
            rt.extend([rt_value] * parsed_hits)

    NistInput = {}
    NistInput['ID'] = hitid
    NistInput['R.T.'] = rt
    for key in _HIT_COLUMNS:
        NistInput[key] = columns[key]
    NistInput_missed = {}
    NistInput_missed['Missed Compounds'] = missed_compounds
    NistInput_missed['RT missed Compounds'] = rt_missed_compounds
    return NistInput, NistInput_missed
def _write_text(path, text):
    '''
    Writes text to path in binary mode, so no newline translation takes place.
    @param path: file to (over)write
    @param text: content to write
    '''
    # On Python 2 a plain str already is bytes and is written unchanged; on
    # Python 3 the text has to be encoded before it can go to a 'wb' handle.
    if not isinstance(text, bytes):
        text = text.encode('utf-8')
    with open(path, 'wb') as output_fh:
        output_fh.write(text)


def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
    '''
    Converts NIST PDF file to tabular format
    @param filename: PDF file to parse
    @param output_file: output file for the hits
    @param error_file: output file for failed hits
    @param print_progress: passed on to getPDF
    '''
    # getPDF returns None when it hits a row with too few fields; unpacking
    # then raises TypeError, exactly as it did before.
    HitList, HitList_missed = getPDF(filename, print_progress)

    # Save the hit list as a tab-separated file: one header line with the
    # column names, then one line per hit. There is no trailing newline after
    # the last hit.
    column_names = list(HitList.keys())
    rows = zip(*[HitList[column] for column in column_names])
    hitlist_as_text = "\t".join(column_names) + "\n"
    hitlist_as_text += "\n".join(["\t".join(row) for row in rows])
    _write_text(output_file, hitlist_as_text)

    # One line per missed hit: spectrum id, a tab, then the raw hit text.
    missed = zip(HitList_missed['Missed Compounds'],
                 HitList_missed['RT missed Compounds'])
    missed_as_text = ''.join(['%s\n' % '\t'.join([spec_id, hit_text])
                              for hit_text, spec_id in missed])
    _write_text(error_file, missed_as_text)
def read_tabular(in_csv):
    '''
    Parses a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    @return: dict mapping each header name to the list of values found in that
             column; an empty dict for an empty file
    '''
    # 'rU' asks Python 2 for universal newlines. Python 3 applies them by
    # default in text mode and no longer accepts the 'U' flag.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(in_csv, mode) as input_fh:
        rows = list(csv.reader(input_fh, delimiter='\t'))
    if not rows:
        return {}

    header = rows[0]
    # csv.reader yields [] for blank lines; they hold no cells to index.
    records = [row for row in rows[1:] if row]

    # Create dictionary with column name as key
    output = {}
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in records]
    return output
def read_tabular_old(filename):
    '''
    Function to read tabular format (created by convert_pdftotext2tabular)
    and output a dict with header of columns as key and value is columns of tabular as list
    @param filename: tabular file to read
    @return: dict mapping each column name to a tuple with that column's values
    '''
    # An IOError from open() propagates to the caller unchanged.
    with open(filename, 'r') as input_fh:
        lines = input_fh.readlines()

    # NOTE(review): strip() also removes leading/trailing tabs and spaces, so
    # an empty first or last cell shifts the remaining cells of that line.
    # Kept as is because it affects the returned values.
    header_line = lines[0] if lines else ''
    colnames = header_line.strip().split('\t')
    rows = [line.strip().split('\t') for line in lines[1:]]

    # Transform from row oriented structure to column oriented structure.
    # zip() is materialised so the columns can be indexed on Python 3 as well.
    if rows:
        columns = list(zip(*rows))
    else:
        # Header without data: every column is empty.
        columns = [()] * len(colnames)

    # Store the columns under their header names.
    RankFilterGC_format = {}
    for colnumber, colname in enumerate(colnames):
        RankFilterGC_format[colname] = columns[colnumber]
    return RankFilterGC_format
if __name__ == '__main__':
    # Command line use: <input file> <hits output file> <missed hits output file>.
    # Progress printing is always switched on; extra arguments are ignored.
    if len(sys.argv) < 4:
        sys.exit('Usage: %s <input_file> <output_file> <error_file>' % sys.argv[0])
    convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)

Mercurial > repos > pieterlukasse > prims_metabolomics

comparison rankfilter_GCMS/pdfread.py @ 0:9d5f4f5f764b