Mercurial > repos > pieterlukasse > prims_metabolomics
diff rankfilter_GCMS/pdftotabular.py @ 0:9d5f4f5f764b
Initial commit to toolshed
author | pieter.lukasse@wur.nl |
---|---|
date | Thu, 16 Jan 2014 13:10:00 +0100 |
parents | |
children | d685210eef3e |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/pdftotabular.py Thu Jan 16 13:10:00 2014 +0100 @@ -0,0 +1,41 @@ +""" +Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this software except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +""" + +import sys +import pdfread +from subprocess import call + + +def convert_pdftotext(filename, output_file): + ''' + Converts PDF file to text + @param filename: PDF file to parse + @param output_file: output text file for the hits + ''' + + try: + call(["pdftotext", filename, output_file]) + except: + raise Exception("Error while trying to convert PDF to text") + + + + +if __name__ == '__main__': + pdf_as_text = sys.argv[1]+".txt" + convert_pdftotext(sys.argv[1], pdf_as_text) + pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False)