prims_metabolomics: rankfilter_GCMS/pdfread.py comparison

comparison rankfilter_GCMS/pdfread.py @ 61:d685210eef3e

fix in pdftotabular tool

author	pieter.lukasse@wur.nl
date	Fri, 19 Dec 2014 15:30:13 +0100
parents	35f506f30ae4
children

comparison

equal deleted inserted replaced

-:35f506f30ae4
+:d685210eef3e
 if print_progress == True:
 print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell
 line_id += 1
 if len(cell) == 7:  # the compound has CAS number
 if len(cell[1].split(':')) == 2:
-forward.append(cell[1].split(':')[1])
+forward.append((cell[1].split(':')[1]).strip())
 # More than two parts after splitting on ":" means the name itself contains ":"; re-join everything after the first part
 if len(cell[0].split(':')) > 2:
 name_tmp = ':'.join(cell[0].split(':')[1:])
 else:
 name_tmp = cell[0].split(':')[1]
-# ugly workaround for the cases where there ends up being no space between the name and the formula: exhaustive
+name.append(name_tmp.replace("  ", " ").strip())
-# replaces of known cases by the same with a white space:
+name_tmp = name_tmp.strip().split(' ')
-name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
-name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
-name_tmp = name_tmp.replace(', LC', ', L C').replace(', DC', ', D C')
-name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace("  ", " "))
 if name_tmp:
-if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H':
+# if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
-formule = (name_tmp.split(' ')[-1])
+if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
+formule = (name_tmp[-1])
 else:
 formule = ('not_def')
 else:
 formule = ('not_def')
 formula.append(formule.replace("  ", " "))
-reverse.append(cell[2].split(':')[1])
+reverse.append((cell[2].split(':')[1]).strip())
 prob.append(cell[3].split(' ')[2].replace('%', ''))
-cas.append(cell[4].split(':')[1])
+cas.append((cell[4].split(':')[1]).strip())
-lib_id.append(cell[5].split(':')[1])
+lib_id.append((cell[5].split(':')[1]).strip())
 nist_id.append(cell[6].split(':')[1].replace('.', '').strip())
 j = j + 1
 else:
 missed_compounds.append(hh)
 rt_missed_compounds.append(spec_id)
-elif len(cell) >= 6:  # the compound has no CAS number
+elif len(cell) == 6:  # the compound has no CAS number
 if len(cell[1].split(':')) == 2:
-forward.append(cell[1].split(':')[1])
+forward.append((cell[1].split(':')[1]).strip())
 # More than two parts after splitting on ":" means the name itself contains ":"; re-join everything after the first part
 if len(cell[0].split(':')) > 2:
 name_tmp = ':'.join(cell[0].split(':')[1:])
 else:
 name_tmp = cell[0].split(':')[1]
-name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
-name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
+name.append(name_tmp.replace("  ", " ").strip())
-name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace("  ", " "))  # "  ", " "
 name_tmp = name_tmp.strip().split(' ')
 if name_tmp:
+# if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
 if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
 formule = (name_tmp[-1])
 else:
 formule = ('not_def')
 else:
 formule = ('not_def')
 formula.append(formule.replace("  ", " "))
-reverse.append(cell[2].split(':')[1])
+reverse.append((cell[2].split(':')[1]).strip())
 prob.append(cell[3].split(' ')[2].replace('%', ''))
 cas.append('undef')
-lib_id.append(cell[4].split(':')[1])
+lib_id.append((cell[4].split(':')[1]).strip())
 nist_id.append(cell[5].split(':')[1].replace('.', '').strip())
 j = j + 1
 else:
 missed_compounds.append(hh)
 rt_missed_compounds.append(spec_id)
 else: # Unexpected number of columns: record the entry as missed and continue
+missed_compounds.append(hh)
-return
+rt_missed_compounds.append(spec_id)
 for _ in range(j):
 hitid.append(str(spec_id.replace("  ", " ")))
 rt.append(str(float(spec_id.split('-')[3]) / 1e+06))
 output_fh.write(Hitlist_as_text)
 output_fh.close()
 out_missed_pdf = open(error_file, 'wb')
 for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
+out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n")
 out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
 out_missed_pdf.close()
 def read_tabular(in_csv):

Mercurial > repos > pieterlukasse > prims_metabolomics

comparison rankfilter_GCMS/pdfread.py @ 61:d685210eef3e